1 //===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
//===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Defines an instruction selector for the AMDGPU target.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPUISelDAGToDAG.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUInstrInfo.h"
17 #include "AMDGPUSubtarget.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "MCTargetDesc/R600MCTargetDesc.h"
21 #include "R600RegisterInfo.h"
22 #include "SIISelLowering.h"
23 #include "SIMachineFunctionInfo.h"
24 #include "llvm/Analysis/UniformityAnalysis.h"
25 #include "llvm/CodeGen/FunctionLoweringInfo.h"
26 #include "llvm/CodeGen/SelectionDAG.h"
27 #include "llvm/CodeGen/SelectionDAGISel.h"
28 #include "llvm/CodeGen/SelectionDAGNodes.h"
29 #include "llvm/IR/IntrinsicsAMDGPU.h"
30 #include "llvm/Support/ErrorHandling.h"
31
32 #ifdef EXPENSIVE_CHECKS
33 #include "llvm/Analysis/LoopInfo.h"
34 #include "llvm/IR/Dominators.h"
35 #endif
36
37 #define DEBUG_TYPE "amdgpu-isel"
38
39 using namespace llvm;
40
41 //===----------------------------------------------------------------------===//
42 // Instruction Selector Implementation
43 //===----------------------------------------------------------------------===//
44
45 namespace {
stripBitcast(SDValue Val)46 static SDValue stripBitcast(SDValue Val) {
47 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
48 }
49
50 // Figure out if this is really an extract of the high 16-bits of a dword.
isExtractHiElt(SDValue In,SDValue & Out)51 static bool isExtractHiElt(SDValue In, SDValue &Out) {
52 In = stripBitcast(In);
53
54 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
55 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
56 if (!Idx->isOne())
57 return false;
58 Out = In.getOperand(0);
59 return true;
60 }
61 }
62
63 if (In.getOpcode() != ISD::TRUNCATE)
64 return false;
65
66 SDValue Srl = In.getOperand(0);
67 if (Srl.getOpcode() == ISD::SRL) {
68 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
69 if (ShiftAmt->getZExtValue() == 16) {
70 Out = stripBitcast(Srl.getOperand(0));
71 return true;
72 }
73 }
74 }
75
76 return false;
77 }
78
79 // Look through operations that obscure just looking at the low 16-bits of the
80 // same register.
stripExtractLoElt(SDValue In)81 static SDValue stripExtractLoElt(SDValue In) {
82 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
83 SDValue Idx = In.getOperand(1);
84 if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
85 return In.getOperand(0);
86 }
87
88 if (In.getOpcode() == ISD::TRUNCATE) {
89 SDValue Src = In.getOperand(0);
90 if (Src.getValueType().getSizeInBits() == 32)
91 return stripBitcast(Src);
92 }
93
94 return In;
95 }
96
97 } // end anonymous namespace
98
// Register the legacy ISel pass with the PassRegistry and declare the
// analyses it depends on (the expensive-checks-only dependencies match the
// LCSSA verification in runOnMachineFunction).
INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                    false)
112
/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
                                        CodeGenOptLevel OptLevel) {
  return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
}
119
// Construction only records the target machine and optimization level;
// per-function state (Subtarget, Mode) is set up in runOnMachineFunction.
AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
                                       CodeGenOptLevel OptLevel)
    : SelectionDAGISel(TM, OptLevel) {}
123
// Cache per-function state (subtarget and FP mode defaults) before running
// the generic SelectionDAG instruction selector.
bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  // Validate the function's requested subtarget features.
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
  return SelectionDAGISel::runOnMachineFunction(MF);
}
130
// Return true if an f16 value produced by node opcode \p Opc is guaranteed,
// on this subtarget, to have the high 16 bits of its 32-bit register zeroed.
bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
  // XXX - only need to list legal operations.
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCANONICALIZE:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
  // Fabs is lowered to a bit operation, but it's an and which will clear the
  // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
  case ISD::FP_ROUND:
    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
    // high bits on gfx9.
    // TODO: If we had the source node we could see if the source was fma/mad
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::DIV_FIXUP:
    // NOTE(review): same VI-only guarantee as FP_ROUND above — presumably
    // these can also select to mixed-precision forms on gfx9; confirm.
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}
193
bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  // With expensive checks enabled, verify up front that every loop in the
  // function is in LCSSA form.
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  return SelectionDAGISelLegacy::runOnMachineFunction(MF);
}
204
// Declare the analyses this pass reads so the legacy pass manager schedules
// them before instruction selection.
void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUArgumentUsageInfo>();
  AU.addRequired<UniformityInfoWrapperPass>();
#ifdef EXPENSIVE_CHECKS
  // Only needed for the LCSSA assertions in runOnMachineFunction.
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
#endif
  SelectionDAGISelLegacy::getAnalysisUsage(AU);
}
214
// Try to fold a v2i16/v2f16 build_vector whose hi (or lo) half is a 16-bit
// load — or an extending 8-bit load — into the corresponding d16_hi (d16_lo)
// load node, which writes only that half of the destination register.
// Returns true and rewrites all uses of \p N on success.
bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    // The tied input provides the half the d16 load leaves untouched.
    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    // Pick the opcode by memory width and extension kind.
    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    // Replace both the vector value and the original load's chain output.
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}
294
// Pre-selection DAG rewrite: fold build_vector halves into d16 load forms on
// subtargets where d16 loads preserve the unused half of the register.
void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  // Iterate over all nodes in reverse.
  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    // Skip nodes that are already dead.
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      // TODO: Match load d16 from shl (extload:i16), 16
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    // Replacements may have orphaned the original loads/build_vectors.
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}
323
isInlineImmediate(const SDNode * N) const324 bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
325 if (N->isUndef())
326 return true;
327
328 const SIInstrInfo *TII = Subtarget->getInstrInfo();
329 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
330 return TII->isInlineConstant(C->getAPIntValue());
331
332 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
333 return TII->isInlineConstant(C->getValueAPF());
334
335 return false;
336 }
337
/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                                  unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    // The only non-machine node handled is CopyToReg, where the class comes
    // from the destination register.
    if (N->getOpcode() == ISD::CopyToReg) {
      Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Reg.isVirtual()) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      // Physical register: use its base class.
      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
      return TRI->getPhysRegBaseClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    // Ordinary machine node: read the class out of the instruction
    // descriptor, skipping the def operands.
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.operands()[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    // REG_SEQUENCE: constrain the super-class (operand 0) by the subregister
    // index paired with this value operand.
    unsigned RCID = N->getConstantOperandVal(0);
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = SubRegOp->getAsZExtVal();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                               SubRegIdx);
  }
  }
}
384
glueCopyToOp(SDNode * N,SDValue NewChain,SDValue Glue) const385 SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
386 SDValue Glue) const {
387 SmallVector <SDValue, 8> Ops;
388 Ops.push_back(NewChain); // Replace the chain.
389 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
390 Ops.push_back(N->getOperand(i));
391
392 Ops.push_back(Glue);
393 return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
394 }
395
// Insert a copy of \p Val into M0 on \p N's chain and glue \p N to it so the
// M0 write is ordered immediately before \p N.
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  // copyToM0 yields the copy node; value 1 is used as the glue operand.
  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
  return glueCopyToOp(N, M0, M0.getValue(1));
}
405
glueCopyToM0LDSInit(SDNode * N) const406 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
407 unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
408 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
409 if (Subtarget->ldsRequiresM0Init())
410 return glueCopyToM0(
411 N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
412 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
413 MachineFunction &MF = CurDAG->getMachineFunction();
414 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
415 return
416 glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
417 }
418 return N;
419 }
420
buildSMovImm64(SDLoc & DL,uint64_t Imm,EVT VT) const421 MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
422 EVT VT) const {
423 SDNode *Lo = CurDAG->getMachineNode(
424 AMDGPU::S_MOV_B32, DL, MVT::i32,
425 CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
426 SDNode *Hi = CurDAG->getMachineNode(
427 AMDGPU::S_MOV_B32, DL, MVT::i32,
428 CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
429 const SDValue Ops[] = {
430 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
431 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
432 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
433
434 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
435 }
436
SelectBuildVector(SDNode * N,unsigned RegClassID)437 void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
438 EVT VT = N->getValueType(0);
439 unsigned NumVectorElts = VT.getVectorNumElements();
440 EVT EltVT = VT.getVectorElementType();
441 SDLoc DL(N);
442 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
443
444 if (NumVectorElts == 1) {
445 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
446 RegClass);
447 return;
448 }
449
450 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
451 if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
452 CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) {
453 uint64_t C = 0;
454 bool AllConst = true;
455 unsigned EltSize = EltVT.getSizeInBits();
456 for (unsigned I = 0; I < NumVectorElts; ++I) {
457 SDValue Op = N->getOperand(I);
458 if (Op.isUndef()) {
459 AllConst = false;
460 break;
461 }
462 uint64_t Val;
463 if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Op)) {
464 Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
465 } else
466 Val = cast<ConstantSDNode>(Op)->getZExtValue();
467 C |= Val << (EltSize * I);
468 }
469 if (AllConst) {
470 SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64);
471 MachineSDNode *Copy =
472 CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, VT, CV);
473 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0),
474 RegClass);
475 return;
476 }
477 }
478
479 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
480 "supported yet");
481 // 32 = Max Num Vector Elements
482 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
483 // 1 = Vector Register Class
484 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
485
486 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
487 bool IsRegSeq = true;
488 unsigned NOps = N->getNumOperands();
489 for (unsigned i = 0; i < NOps; i++) {
490 // XXX: Why is this here?
491 if (isa<RegisterSDNode>(N->getOperand(i))) {
492 IsRegSeq = false;
493 break;
494 }
495 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
496 : R600RegisterInfo::getSubRegFromChannel(i);
497 RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
498 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
499 }
500 if (NOps != NumVectorElts) {
501 // Fill in the missing undef elements if this was a scalar_to_vector.
502 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
503 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
504 DL, EltVT);
505 for (unsigned i = NOps; i < NumVectorElts; ++i) {
506 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
507 : R600RegisterInfo::getSubRegFromChannel(i);
508 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
509 RegSeqArgs[1 + (2 * i) + 1] =
510 CurDAG->getTargetConstant(Sub, DL, MVT::i32);
511 }
512 }
513
514 if (!IsRegSeq)
515 SelectCode(N);
516 CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
517 }
518
// Select a 2-element, 32-bit-element VECTOR_SHUFFLE as either a single
// V_PK_MOV_B32 (divergent case needing a cross-half blend) or as two
// subregister extracts combined with a REG_SEQUENCE.
void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) {
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // TODO: Handle 16-bit element vectors with even aligned masks.
  if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
      VT.getVectorNumElements() != 2) {
    SelectCode(N);
    return;
  }

  auto *SVN = cast<ShuffleVectorSDNode>(N);

  SDValue Src0 = SVN->getOperand(0);
  SDValue Src1 = SVN->getOperand(1);
  ArrayRef<int> Mask = SVN->getMask();
  SDLoc DL(N);

  assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
         Mask[0] < 4 && Mask[1] < 4);

  // Mask values 0-1 index into Src0, 2-3 into Src1; bit 0 of the mask value
  // picks the lane within the selected source.
  SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
  SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
  unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
  unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;

  // Negative mask entries are undef lanes; read them from an IMPLICIT_DEF.
  if (Mask[0] < 0) {
    Src0SubReg = Src1SubReg;
    MachineSDNode *ImpDef =
        CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
    VSrc0 = SDValue(ImpDef, 0);
  }

  if (Mask[1] < 0) {
    Src1SubReg = Src0SubReg;
    MachineSDNode *ImpDef =
        CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
    VSrc1 = SDValue(ImpDef, 0);
  }

  // SGPR case needs to lower to copies.
  //
  // Also use subregister extract when we can directly blend the registers with
  // a simple subregister copy.
  //
  // TODO: Maybe we should fold this out earlier
  if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
      Src1SubReg == AMDGPU::sub0) {
    // The low element of the result always comes from src0.
    // The high element of the result always comes from src1.
    // op_sel selects the high half of src0.
    // op_sel_hi selects the high half of src1.

    unsigned Src0OpSel =
        Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
    unsigned Src1OpSel =
        Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;

    // Enable op_sel_hi to avoid printing it. This should have no effect on the
    // result.
    Src0OpSel |= SISrcMods::OP_SEL_1;
    Src1OpSel |= SISrcMods::OP_SEL_1;

    SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
    SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
    SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);

    CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
                         {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
                          ZeroMods,   // clamp
                          ZeroMods,   // op_sel
                          ZeroMods,   // op_sel_hi
                          ZeroMods,   // neg_lo
                          ZeroMods}); // neg_hi
    return;
  }

  // Otherwise blend the halves with plain subregister extracts.
  SDValue ResultElt0 =
      CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
  SDValue ResultElt1 =
      CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);

  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
  CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
}
607
// Main selection entry point: handle AMDGPU-specific nodes directly and defer
// everything else to the TableGen-generated matcher via SelectCode.
void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
    // LDS/GDS accesses may need an M0 initialization glued in first.
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom lower it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      // v2i16/v2f16 of constants can be packed into a single 32-bit value.
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID =
        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::VECTOR_SHUFFLE:
    SelectVectorShuffle(N);
    return;
  case ISD::BUILD_PAIR: {
    // Lower BUILD_PAIR directly to a REG_SEQUENCE of the two halves.
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    // Only 64-bit constants that are neither inline immediates nor encodable
    // as a single 32-bit literal need the explicit two-S_MOV expansion.
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) ||
        Subtarget->has64BitLiterals())
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, true))
        break;
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, false))
        break;
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
                            WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return SelectMUL_LOHI(N);
  case ISD::CopyToReg: {
    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FP_EXTEND:
    SelectFP_EXTEND(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  case AMDGPUISD::WAVE_ADDRESS: {
    SelectWAVE_ADDRESS(N);
    return;
  }
  case ISD::STACKRESTORE: {
    SelectSTACKRESTORE(N);
    return;
  }
  }

  SelectCode(N);
}
827
isUniformBr(const SDNode * N) const828 bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
829 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
830 const Instruction *Term = BB->getTerminator();
831 return Term->getMetadata("amdgpu.uniform") ||
832 Term->getMetadata("structurizecfg.uniform");
833 }
834
isUnneededShiftMask(const SDNode * N,unsigned ShAmtBits) const835 bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
836 unsigned ShAmtBits) const {
837 assert(N->getOpcode() == ISD::AND);
838
839 const APInt &RHS = N->getConstantOperandAPInt(1);
840 if (RHS.countr_one() >= ShAmtBits)
841 return true;
842
843 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
844 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
845 }
846
// Recognize an i64 base+constant-offset address whose low-half `or` was
// split into 32-bit halves, returning the base in \p N0 and the constant
// offset in \p N1.
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
                                          SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
      Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(0).getOperand(0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
      SDValue BaseLo = Lo.getOperand(0);
      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
      // Check that split base (Lo and Hi) are extracted from the same one.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
          BaseLo.getConstantOperandVal(1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
          BaseHi.getConstantOperandVal(1) == 1) {
        N0 = BaseLo.getOperand(0).getOperand(0);
        N1 = Lo.getOperand(1);
        return true;
      }
    }
  }
  return false;
}
877
isBaseWithConstantOffset64(SDValue Addr,SDValue & LHS,SDValue & RHS) const878 bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
879 SDValue &RHS) const {
880 if (CurDAG->isBaseWithConstantOffset(Addr)) {
881 LHS = Addr.getOperand(0);
882 RHS = Addr.getOperand(1);
883 return true;
884 }
885
886 if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
887 assert(LHS && RHS && isa<ConstantSDNode>(RHS));
888 return true;
889 }
890
891 return false;
892 }
893
// Human-readable pass name (shown by e.g. -debug-pass output).
StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}
897
// New pass manager wrapper: forwards to the shared AMDGPUDAGToDAGISel
// implementation at the target machine's optimization level.
AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(TargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<AMDGPUDAGToDAGISel>(TM, TM.getOptLevel())) {}
901
PreservedAnalyses
AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
                            MachineFunctionAnalysisManager &MFAM) {
#ifdef EXPENSIVE_CHECKS
  // Expensive-checks builds: verify every loop is in LCSSA form before
  // selection, mirroring the legacy pass's runOnMachineFunction.
  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  auto &F = MF.getFunction();
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
  for (auto &L : LI.getLoopsInPreorder())
    assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
#endif
  return SelectionDAGISelPass::run(MF, MFAM);
}
916
917 //===----------------------------------------------------------------------===//
918 // Complex Patterns
919 //===----------------------------------------------------------------------===//
920
// Complex-pattern stub: never matches, so selection always falls back to
// other addressing patterns.
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}
925
SelectADDRIndirect(SDValue Addr,SDValue & Base,SDValue & Offset)926 bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
927 SDValue &Offset) {
928 ConstantSDNode *C;
929 SDLoc DL(Addr);
930
931 if ((C = dyn_cast<ConstantSDNode>(Addr))) {
932 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
933 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
934 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
935 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
936 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
937 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
938 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
939 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
940 Base = Addr.getOperand(0);
941 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
942 } else {
943 Base = Addr;
944 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
945 }
946
947 return true;
948 }
949
getMaterializedScalarImm32(int64_t Val,const SDLoc & DL) const950 SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
951 const SDLoc &DL) const {
952 SDNode *Mov = CurDAG->getMachineNode(
953 AMDGPU::S_MOV_B32, DL, MVT::i32,
954 CurDAG->getTargetConstant(Val, DL, MVT::i32));
955 return SDValue(Mov, 0);
956 }
957
958 // FIXME: Should only handle uaddo_carry/usubo_carry
// Expand a 64-bit add/sub (optionally with carry-in/carry-out) into two glued
// 32-bit operations over the sub0/sub1 halves, choosing scalar or vector
// opcodes based on the node's divergence.
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  // ADDE/SUBE consume a carry-in; any of the carry opcodes also produce a
  // carry-out.
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  // Split both 64-bit operands into their low and high 32-bit halves.
  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  // Opcode table indexed as [consumes-carry][divergent][is-add].
  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  // Low half: plain op, or the carry-consuming op when there is a carry-in.
  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  // High half always consumes the glued carry produced by the low half.
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  // Recombine the two 32-bit results into a 64-bit register pair.
  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo,0),
    Sub0,
    SDValue(AddHi,0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}
1027
SelectAddcSubb(SDNode * N)1028 void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1029 SDValue LHS = N->getOperand(0);
1030 SDValue RHS = N->getOperand(1);
1031 SDValue CI = N->getOperand(2);
1032
1033 if (N->isDivergent()) {
1034 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
1035 : AMDGPU::V_SUBB_U32_e64;
1036 CurDAG->SelectNodeTo(
1037 N, Opc, N->getVTList(),
1038 {LHS, RHS, CI,
1039 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1040 } else {
1041 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
1042 : AMDGPU::S_SUB_CO_PSEUDO;
1043 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
1044 }
1045 }
1046
SelectUADDO_USUBO(SDNode * N)1047 void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
1048 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1049 // carry out despite the _i32 name. These were renamed in VI to _U32.
1050 // FIXME: We should probably rename the opcodes here.
1051 bool IsAdd = N->getOpcode() == ISD::UADDO;
1052 bool IsVALU = N->isDivergent();
1053
1054 for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
1055 ++UI)
1056 if (UI.getUse().getResNo() == 1) {
1057 if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
1058 (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
1059 IsVALU = true;
1060 break;
1061 }
1062 }
1063
1064 if (IsVALU) {
1065 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1066
1067 CurDAG->SelectNodeTo(
1068 N, Opc, N->getVTList(),
1069 {N->getOperand(0), N->getOperand(1),
1070 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1071 } else {
1072 unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
1073 : AMDGPU::S_USUBO_PSEUDO;
1074
1075 CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
1076 {N->getOperand(0), N->getOperand(1)});
1077 }
1078 }
1079
SelectFMA_W_CHAIN(SDNode * N)1080 void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
1081 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
1082 SDValue Ops[10];
1083
1084 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
1085 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1086 SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
1087 Ops[8] = N->getOperand(0);
1088 Ops[9] = N->getOperand(4);
1089
1090 // If there are no source modifiers, prefer fmac over fma because it can use
1091 // the smaller VOP2 encoding.
1092 bool UseFMAC = Subtarget->hasDLInsts() &&
1093 cast<ConstantSDNode>(Ops[0])->isZero() &&
1094 cast<ConstantSDNode>(Ops[2])->isZero() &&
1095 cast<ConstantSDNode>(Ops[4])->isZero();
1096 unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
1097 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
1098 }
1099
SelectFMUL_W_CHAIN(SDNode * N)1100 void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
1101 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
1102 SDValue Ops[8];
1103
1104 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
1105 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1106 Ops[6] = N->getOperand(0);
1107 Ops[7] = N->getOperand(3);
1108
1109 CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
1110 }
1111
1112 // We need to handle this here because tablegen doesn't support matching
1113 // instructions with multiple outputs.
SelectDIV_SCALE(SDNode * N)1114 void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
1115 EVT VT = N->getValueType(0);
1116
1117 assert(VT == MVT::f32 || VT == MVT::f64);
1118
1119 unsigned Opc
1120 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
1121
1122 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
1123 // omod
1124 SDValue Ops[8];
1125 SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
1126 SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
1127 SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
1128 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1129 }
1130
1131 // We need to handle this here because tablegen doesn't support matching
1132 // instructions with multiple outputs.
SelectMAD_64_32(SDNode * N)1133 void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1134 SDLoc SL(N);
1135 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1136 unsigned Opc;
1137 if (Subtarget->hasMADIntraFwdBug())
1138 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1139 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1140 else
1141 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1142
1143 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1144 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1145 Clamp };
1146 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1147 }
1148
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
// Selects [SU]MUL_LOHI as a 64-bit MAD with a zero addend, then extracts the
// low/high 32-bit halves for whichever results are actually used.
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opc;
  // gfx11 parts with the MAD intra-forwarding bug need dedicated opcodes.
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  // mul_lohi(a, b) == mad(a, b, 0).
  SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(
      Opc, SL, CurDAG->getVTList(MVT::i64, MVT::i1), Ops);
  // Only materialize the subregister extracts for results that have users.
  if (!SDValue(N, 0).use_empty()) {
    SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub0);
    ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
  }
  if (!SDValue(N, 1).use_empty()) {
    SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub1);
    ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}
1180
isDSOffsetLegal(SDValue Base,unsigned Offset) const1181 bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1182 if (!isUInt<16>(Offset))
1183 return false;
1184
1185 if (!Base || Subtarget->hasUsableDSOffset() ||
1186 Subtarget->unsafeDSOffsetFoldingEnabled())
1187 return true;
1188
1189 // On Southern Islands instruction with a negative base value and an offset
1190 // don't seem to work.
1191 return CurDAG->SignBitIsZero(Base);
1192 }
1193
SelectDS1Addr1Offset(SDValue Addr,SDValue & Base,SDValue & Offset) const1194 bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1195 SDValue &Offset) const {
1196 SDLoc DL(Addr);
1197 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1198 SDValue N0 = Addr.getOperand(0);
1199 SDValue N1 = Addr.getOperand(1);
1200 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1201 if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1202 // (add n0, c0)
1203 Base = N0;
1204 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1205 return true;
1206 }
1207 } else if (Addr.getOpcode() == ISD::SUB) {
1208 // sub C, x -> add (sub 0, x), C
1209 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1210 int64_t ByteOffset = C->getSExtValue();
1211 if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1212 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1213
1214 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1215 // the known bits in isDSOffsetLegal. We need to emit the selected node
1216 // here, so this is thrown away.
1217 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1218 Zero, Addr.getOperand(1));
1219
1220 if (isDSOffsetLegal(Sub, ByteOffset)) {
1221 SmallVector<SDValue, 3> Opnds;
1222 Opnds.push_back(Zero);
1223 Opnds.push_back(Addr.getOperand(1));
1224
1225 // FIXME: Select to VOP3 version for with-carry.
1226 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1227 if (Subtarget->hasAddNoCarry()) {
1228 SubOp = AMDGPU::V_SUB_U32_e64;
1229 Opnds.push_back(
1230 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1231 }
1232
1233 MachineSDNode *MachineSub =
1234 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1235
1236 Base = SDValue(MachineSub, 0);
1237 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1238 return true;
1239 }
1240 }
1241 }
1242 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1243 // If we have a constant address, prefer to put the constant into the
1244 // offset. This can save moves to load the constant address since multiple
1245 // operations can share the zero base address register, and enables merging
1246 // into read2 / write2 instructions.
1247
1248 SDLoc DL(Addr);
1249
1250 if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1251 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1252 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1253 DL, MVT::i32, Zero);
1254 Base = SDValue(MovZero, 0);
1255 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1256 return true;
1257 }
1258 }
1259
1260 // default case
1261 Base = Addr;
1262 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1263 return true;
1264 }
1265
isDSOffset2Legal(SDValue Base,unsigned Offset0,unsigned Offset1,unsigned Size) const1266 bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1267 unsigned Offset1,
1268 unsigned Size) const {
1269 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1270 return false;
1271 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1272 return false;
1273
1274 if (!Base || Subtarget->hasUsableDSOffset() ||
1275 Subtarget->unsafeDSOffsetFoldingEnabled())
1276 return true;
1277
1278 // On Southern Islands instruction with a negative base value and an offset
1279 // don't seem to work.
1280 return CurDAG->SignBitIsZero(Base);
1281 }
1282
1283 // Return whether the operation has NoUnsignedWrap property.
isNoUnsignedWrap(SDValue Addr)1284 static bool isNoUnsignedWrap(SDValue Addr) {
1285 return (Addr.getOpcode() == ISD::ADD &&
1286 Addr->getFlags().hasNoUnsignedWrap()) ||
1287 Addr->getOpcode() == ISD::OR;
1288 }
1289
// Check that the base address of flat scratch load/store in the form of `base +
// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
  // A non-wrapping add (or OR) cannot overflow into a negative sum.
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  // NOTE(review): from here on Addr is assumed to be a two-operand node
  // (add/or) — confirm callers only pass such shapes.
  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  ConstantSDNode *ImmOp = nullptr;
  if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
    if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
      return true;
  }

  // Otherwise the base (first operand) must be provably non-negative.
  return CurDAG->SignBitIsZero(LHS);
}
1317
1318 // Check address value in SGPR/VGPR are legal for flat scratch in the form
1319 // of: SGPR + VGPR.
isFlatScratchBaseLegalSV(SDValue Addr) const1320 bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1321 if (isNoUnsignedWrap(Addr))
1322 return true;
1323
1324 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1325 // values.
1326 if (Subtarget->hasSignedScratchOffsets())
1327 return true;
1328
1329 auto LHS = Addr.getOperand(0);
1330 auto RHS = Addr.getOperand(1);
1331 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1332 }
1333
// Check address value in SGPR/VGPR are legal for flat scratch in the form
// of: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (AMDGPU::isGFX12Plus(*Subtarget))
    return true;

  // Addr is expected to be (add (add SGPR, VGPR), imm): Base is the inner
  // register sum, RHSImm the immediate part — TODO confirm callers.
  auto Base = Addr.getOperand(0);
  auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(Base) &&
      (isNoUnsignedWrap(Addr) ||
       (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
    return true;

  // Otherwise both inner addends must be provably non-negative.
  auto LHS = Base.getOperand(0);
  auto RHS = Base.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}
1357
// TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  // Two adjacent 4-byte elements.
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}
1364
bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  // Two adjacent 8-byte elements.
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}
1370
SelectDSReadWrite2(SDValue Addr,SDValue & Base,SDValue & Offset0,SDValue & Offset1,unsigned Size) const1371 bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1372 SDValue &Offset0, SDValue &Offset1,
1373 unsigned Size) const {
1374 SDLoc DL(Addr);
1375
1376 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1377 SDValue N0 = Addr.getOperand(0);
1378 SDValue N1 = Addr.getOperand(1);
1379 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1380 unsigned OffsetValue0 = C1->getZExtValue();
1381 unsigned OffsetValue1 = OffsetValue0 + Size;
1382
1383 // (add n0, c0)
1384 if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1385 Base = N0;
1386 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1387 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1388 return true;
1389 }
1390 } else if (Addr.getOpcode() == ISD::SUB) {
1391 // sub C, x -> add (sub 0, x), C
1392 if (const ConstantSDNode *C =
1393 dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1394 unsigned OffsetValue0 = C->getZExtValue();
1395 unsigned OffsetValue1 = OffsetValue0 + Size;
1396
1397 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1398 SDLoc DL(Addr);
1399 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1400
1401 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1402 // the known bits in isDSOffsetLegal. We need to emit the selected node
1403 // here, so this is thrown away.
1404 SDValue Sub =
1405 CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1406
1407 if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1408 SmallVector<SDValue, 3> Opnds;
1409 Opnds.push_back(Zero);
1410 Opnds.push_back(Addr.getOperand(1));
1411 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1412 if (Subtarget->hasAddNoCarry()) {
1413 SubOp = AMDGPU::V_SUB_U32_e64;
1414 Opnds.push_back(
1415 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1416 }
1417
1418 MachineSDNode *MachineSub = CurDAG->getMachineNode(
1419 SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1420
1421 Base = SDValue(MachineSub, 0);
1422 Offset0 =
1423 CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1424 Offset1 =
1425 CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1426 return true;
1427 }
1428 }
1429 }
1430 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1431 unsigned OffsetValue0 = CAddr->getZExtValue();
1432 unsigned OffsetValue1 = OffsetValue0 + Size;
1433
1434 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1435 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1436 MachineSDNode *MovZero =
1437 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1438 Base = SDValue(MovZero, 0);
1439 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1440 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1441 return true;
1442 }
1443 }
1444
1445 // default case
1446
1447 Base = Addr;
1448 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
1449 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
1450 return true;
1451 }
1452
// Decompose Addr into the full set of MUBUF operands (Ptr/VAddr/SOffset/
// Offset plus the offen/idxen/addr64 mode bits), choosing between addr64 and
// offset-only addressing based on the divergence of the address components.
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instruction
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  // Default all mode bits to off; overwritten below where needed.
  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = Subtarget->hasRestrictedSOffset()
                ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                : CurDAG->getTargetConstant(0, DL, MVT::i32);

  // Peel off a 32-bit constant offset if one is present.
  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    // The uniform addend becomes the resource pointer, the divergent one the
    // vector address.
    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}
1540
SelectMUBUFAddr64(SDValue Addr,SDValue & SRsrc,SDValue & VAddr,SDValue & SOffset,SDValue & Offset) const1541 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1542 SDValue &VAddr, SDValue &SOffset,
1543 SDValue &Offset) const {
1544 SDValue Ptr, Offen, Idxen, Addr64;
1545
1546 // addr64 bit was removed for volcanic islands.
1547 // FIXME: This should be a pattern predicate and not reach here
1548 if (!Subtarget->hasAddr64())
1549 return false;
1550
1551 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1552 return false;
1553
1554 ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1555 if (C->getSExtValue()) {
1556 SDLoc DL(Addr);
1557
1558 const SITargetLowering& Lowering =
1559 *static_cast<const SITargetLowering*>(getTargetLowering());
1560
1561 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1562 return true;
1563 }
1564
1565 return false;
1566 }
1567
foldFrameIndex(SDValue N) const1568 std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1569 SDLoc DL(N);
1570
1571 auto *FI = dyn_cast<FrameIndexSDNode>(N);
1572 SDValue TFI =
1573 FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1574
1575 // We rebase the base address into an absolute stack address and hence
1576 // use constant 0 for soffset. This value must be retained until
1577 // frame elimination and eliminateFrameIndex will choose the appropriate
1578 // frame register if need be.
1579 return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1580 }
1581
// Match a private (scratch) address for MUBUF offen-mode access: VGPR address
// plus SGPR soffset plus immediate offset against the scratch resource.
// Always succeeds.
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
        AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
    // Don't fold null pointer.
    if (Imm != NullPtr) {
      // Split the constant: the high bits are materialized into a VGPR, the
      // low bits (up to the max encodable value) go into the immediate field.
      const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
      SDValue HighBits =
          CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
          AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
      VAddr = SDValue(MovHighBits, 0);

      SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
      return true;
    }
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    uint64_t C1 = Addr.getConstantOperandVal(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    const SIInstrInfo *TII = Subtarget->getInstrInfo();
    if (TII->isLegalMUBUFImmOffset(C1) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}
1648
IsCopyFromSGPR(const SIRegisterInfo & TRI,SDValue Val)1649 static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1650 if (Val.getOpcode() != ISD::CopyFromReg)
1651 return false;
1652 auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1653 if (!Reg.isPhysical())
1654 return false;
1655 const auto *RC = TRI.getPhysRegBaseClass(Reg);
1656 return RC && TRI.isSGPRClass(RC);
1657 }
1658
// Match a scratch access whose address is entirely uniform: an SGPR copy
// (plus an optional legal immediate), or a bare legal constant, using MUBUF
// offset mode with no vaddr.
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  if (IsCopyFromSGPR(*TRI, Addr)) {
    SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
      return false;

    SOffset = Addr.getOperand(0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
             TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
    // <constant>
    SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  // Both accepted shapes leave the constant in CAddr for the offset field.
  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
  return true;
}
1701
SelectMUBUFOffset(SDValue Addr,SDValue & SRsrc,SDValue & SOffset,SDValue & Offset) const1702 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1703 SDValue &SOffset, SDValue &Offset
1704 ) const {
1705 SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1706 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1707
1708 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1709 return false;
1710
1711 if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1712 !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1713 !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1714 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1715 maskTrailingOnes<uint64_t>(32); // Size
1716 SDLoc DL(Addr);
1717
1718 const SITargetLowering& Lowering =
1719 *static_cast<const SITargetLowering*>(getTargetLowering());
1720
1721 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1722 return true;
1723 }
1724 return false;
1725 }
1726
SelectBUFSOffset(SDValue ByteOffsetNode,SDValue & SOffset) const1727 bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1728 SDValue &SOffset) const {
1729 if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1730 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1731 return true;
1732 }
1733
1734 SOffset = ByteOffsetNode;
1735 return true;
1736 }
1737
1738 // Find a load or store from corresponding pattern root.
1739 // Roots may be build_vector, bitconvert or their combinations.
findMemSDNode(SDNode * N)1740 static MemSDNode* findMemSDNode(SDNode *N) {
1741 N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
1742 if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1743 return MN;
1744 assert(isa<BuildVectorSDNode>(N));
1745 for (SDValue V : N->op_values())
1746 if (MemSDNode *MN =
1747 dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
1748 return MN;
1749 llvm_unreachable("cannot find MemSDNode in the pattern!");
1750 }
1751
// Match a flat-family address as (vaddr + immediate offset). When the
// constant offset does not fit the instruction's offset field, it is split:
// the legal low part stays in the immediate and the remainder is added to
// vaddr with VALU adds. Always succeeds, falling back to the whole address
// in VAddr with a zero offset.
bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
                                              SDValue &VAddr, SDValue &Offset,
                                              uint64_t FlatVariant) const {
  int64_t OffsetVal = 0;

  unsigned AS = findMemSDNode(N)->getAddressSpace();

  // On subtargets with the flat segment offset bug, plain FLAT accesses to
  // the flat/global address spaces must not use an immediate offset.
  bool CanHaveFlatSegmentOffsetBug =
      Subtarget->hasFlatSegmentOffsetBug() &&
      FlatVariant == SIInstrFlags::FLAT &&
      (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);

  if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
    SDValue N0, N1;
    // Split Addr into base N0 plus constant N1 if possible. For the scratch
    // variant the base must additionally pass the flat-scratch legality check.
    if (isBaseWithConstantOffset64(Addr, N0, N1) &&
        (FlatVariant != SIInstrFlags::FlatScratch ||
         isFlatScratchBaseLegal(Addr))) {
      int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

      const SIInstrInfo *TII = Subtarget->getInstrInfo();
      if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
        // The whole constant fits into the instruction's offset field.
        Addr = N0;
        OffsetVal = COffsetVal;
      } else {
        // If the offset doesn't fit, put the low bits into the offset field and
        // add the rest.
        //
        // For a FLAT instruction the hardware decides whether to access
        // global/scratch/shared memory based on the high bits of vaddr,
        // ignoring the offset field, so we have to ensure that when we add
        // remainder to vaddr it still points into the same underlying object.
        // The easiest way to do that is to make sure that we split the offset
        // into two pieces that are both >= 0 or both <= 0.

        SDLoc DL(N);
        uint64_t RemainderOffset;

        std::tie(OffsetVal, RemainderOffset) =
            TII->splitFlatOffset(COffsetVal, AS, FlatVariant);

        SDValue AddOffsetLo =
            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);

        if (Addr.getValueType().getSizeInBits() == 32) {
          // 32-bit address: a single VALU add covers the remainder. Prefer
          // the carry-less add when the subtarget has one.
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(N0);
          Opnds.push_back(AddOffsetLo);
          unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            AddOp = AMDGPU::V_ADD_U32_e64;
            Opnds.push_back(Clamp);
          }
          Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
        } else {
          // TODO: Should this try to use a scalar add pseudo if the base address
          // is uniform and saddr is usable?
          // 64-bit address: add the remainder with an add/addc pair over the
          // two 32-bit halves and reassemble the result via REG_SEQUENCE.
          SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
          SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

          SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub0);
          SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub1);

          SDValue AddOffsetHi =
              getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);

          SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);

          SDNode *Add =
              CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
                                     {AddOffsetLo, SDValue(N0Lo, 0), Clamp});

          // Carry from the low add (result 1) feeds the high add.
          SDNode *Addc = CurDAG->getMachineNode(
              AMDGPU::V_ADDC_U32_e64, DL, VTs,
              {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

          SDValue RegSequenceArgs[] = {
              CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
              SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};

          Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::i64, RegSequenceArgs),
                         0);
        }
      }
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32);
  return true;
}
1846
// Select a FLAT-segment access as vaddr + immediate offset.
bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
}
1852
// Select a global (FlatGlobal variant) access as vaddr + immediate offset.
bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
                                            SDValue &VAddr,
                                            SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
}
1858
// Select a scratch (FlatScratch variant) access as vaddr + immediate offset.
bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
                                             SDValue &VAddr,
                                             SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
                              SIInstrFlags::FlatScratch);
}
1865
1866 // If this matches zero_extend i32:x, return x
matchZExtFromI32(SDValue Op)1867 static SDValue matchZExtFromI32(SDValue Op) {
1868 if (Op.getOpcode() != ISD::ZERO_EXTEND)
1869 return SDValue();
1870
1871 SDValue ExtSrc = Op.getOperand(0);
1872 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1873 }
1874
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
                                           SDValue Addr,
                                           SDValue &SAddr,
                                           SDValue &VOffset,
                                           SDValue &Offset) const {
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.

  SDValue LHS, RHS;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
                               SIInstrFlags::FlatGlobal)) {
      // The constant fits the instruction's offset field; keep matching the
      // remaining base below.
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent()) {
      if (COffsetVal > 0) {
        SDLoc SL(N);
        // saddr + large_offset -> saddr +
        //                         (voffset = large_offset & ~MaxOffset) +
        //                         (large_offset & MaxOffset);
        int64_t SplitImmOffset, RemainderOffset;
        std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
            COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);

        if (isUInt<32>(RemainderOffset)) {
          // Materialize the out-of-range part of the offset in a VGPR and
          // use it as voffset.
          SDNode *VMov = CurDAG->getMachineNode(
              AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
              CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
          VOffset = SDValue(VMov, 0);
          SAddr = LHS;
          Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
          return true;
        }
      }

      // We are adding a 64 bit SGPR and a constant. If constant bus limit
      // is 1 we would need to perform 1 or 2 extra moves for each half of
      // the constant and it is better to do a scalar add and then issue a
      // single VALU instruction to materialize zero. Otherwise it is less
      // instructions to perform VALU adds with immediates or inline literals.
      unsigned NumLiterals =
          !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) +
          !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal)));
      if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
        return false;
    }
  }

  // Match the variable offset.
  if (Addr.getOpcode() == ISD::ADD) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);

    if (!LHS->isDivergent()) {
      // add (i64 sgpr), (zero_extend (i32 vgpr))
      if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
        SAddr = LHS;
        VOffset = ZextRHS;
      }
    }

    if (!SAddr && !RHS->isDivergent()) {
      // add (zero_extend (i32 vgpr)), (i64 sgpr)
      if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
        SAddr = RHS;
        VOffset = ZextLHS;
      }
    }

    if (SAddr) {
      Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
      return true;
    }
  }

  // A divergent, undef or constant address cannot serve as the SGPR base.
  if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
      isa<ConstantSDNode>(Addr))
    return false;

  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
  // moves required to copy a 64-bit SGPR to VGPR.
  SAddr = Addr;
  SDNode *VMov =
      CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
                             CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
  VOffset = SDValue(VMov, 0);
  Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
  return true;
}
1970
SelectSAddrFI(SelectionDAG * CurDAG,SDValue SAddr)1971 static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1972 if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1973 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1974 } else if (SAddr.getOpcode() == ISD::ADD &&
1975 isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1976 // Materialize this into a scalar move for scalar address to avoid
1977 // readfirstlane.
1978 auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1979 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1980 FI->getValueType(0));
1981 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
1982 MVT::i32, TFI, SAddr.getOperand(1)),
1983 0);
1984 }
1985
1986 return SAddr;
1987 }
1988
// Match (32-bit SGPR base) + sext(imm offset)
bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
                                            SDValue &SAddr,
                                            SDValue &Offset) const {
  // The SADDR form requires a uniform address.
  if (Addr->isDivergent())
    return false;

  SDLoc DL(Addr);

  int64_t COffsetVal = 0;

  // Peel off a constant offset when the remaining base is still legal for
  // flat scratch addressing.
  if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
    COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
    SAddr = Addr.getOperand(0);
  } else {
    SAddr = Addr;
  }

  // Fold a frame index (possibly under an add) into its target form.
  SAddr = SelectSAddrFI(CurDAG, SAddr);

  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
                              SIInstrFlags::FlatScratch)) {
    // The constant doesn't fit the immediate field: keep the legal low part
    // and fold the remainder into the scalar base with an S_ADD_I32.
    int64_t SplitImmOffset, RemainderOffset;
    std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
        COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);

    COffsetVal = SplitImmOffset;

    // A target frame index can't be an S_ADD_I32 literal operand directly;
    // materialize the remainder in an SGPR first.
    SDValue AddOffset =
        SAddr.getOpcode() == ISD::TargetFrameIndex
            ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
            : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32);
    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
                                           SAddr, AddOffset),
                    0);
  }

  Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32);

  return true;
}
2032
2033 // Check whether the flat scratch SVS swizzle bug affects this access.
checkFlatScratchSVSSwizzleBug(SDValue VAddr,SDValue SAddr,uint64_t ImmOffset) const2034 bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
2035 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
2036 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
2037 return false;
2038
2039 // The bug affects the swizzling of SVS accesses if there is any carry out
2040 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
2041 // voffset to (soffset + inst_offset).
2042 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
2043 KnownBits SKnown =
2044 KnownBits::add(CurDAG->computeKnownBits(SAddr),
2045 KnownBits::makeConstant(APInt(32, ImmOffset,
2046 /*isSigned=*/true)));
2047 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
2048 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
2049 return (VMax & 3) + (SMax & 3) >= 4;
2050 }
2051
// Match a flat scratch SVS access: (VGPR offset) + (SGPR base) + imm offset.
bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
                                             SDValue &VAddr, SDValue &SAddr,
                                             SDValue &Offset) const {
  int64_t ImmOffset = 0;

  SDValue LHS, RHS;
  // Remember the original address so we know below whether a constant was
  // peeled off (the legality check differs for the two cases).
  SDValue OrigAddr = Addr;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
                               SIInstrFlags::FlatScratch)) {
      // The constant fits the immediate field; keep matching on the base.
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent() && COffsetVal > 0) {
      SDLoc SL(N);
      // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
      //                         (large_offset & MaxOffset);
      int64_t SplitImmOffset, RemainderOffset;
      std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
          COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);

      if (isUInt<32>(RemainderOffset)) {
        // Materialize the out-of-range part of the offset in a VGPR as the
        // vaddr operand.
        SDNode *VMov = CurDAG->getMachineNode(
            AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
            CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
        VAddr = SDValue(VMov, 0);
        SAddr = LHS;
        if (!isFlatScratchBaseLegal(Addr))
          return false;
        if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
          return false;
        Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
        return true;
      }
    }
  }

  if (Addr.getOpcode() != ISD::ADD)
    return false;

  LHS = Addr.getOperand(0);
  RHS = Addr.getOperand(1);

  // Exactly one operand must be uniform: that one becomes the SGPR base and
  // the divergent one the VGPR offset.
  if (!LHS->isDivergent() && RHS->isDivergent()) {
    SAddr = LHS;
    VAddr = RHS;
  } else if (!RHS->isDivergent() && LHS->isDivergent()) {
    SAddr = RHS;
    VAddr = LHS;
  } else {
    return false;
  }

  // Use the immediate-aware legality check only when a constant offset was
  // actually peeled off above.
  if (OrigAddr != Addr) {
    if (!isFlatScratchBaseLegalSVImm(OrigAddr))
      return false;
  } else {
    if (!isFlatScratchBaseLegalSV(OrigAddr))
      return false;
  }

  if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
    return false;
  SAddr = SelectSAddrFI(CurDAG, SAddr);
  Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
  return true;
}
2121
2122 // For unbuffered smem loads, it is illegal for the Immediate Offset to be
2123 // negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
2124 // Handle the case where the Immediate Offset + SOffset is negative.
isSOffsetLegalWithImmOffset(SDValue * SOffset,bool Imm32Only,bool IsBuffer,int64_t ImmOffset) const2125 bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2126 bool Imm32Only,
2127 bool IsBuffer,
2128 int64_t ImmOffset) const {
2129 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2130 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2131 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2132 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2133 return false;
2134 }
2135
2136 return true;
2137 }
2138
// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
// not null) offset. If Imm32Only is true, match only 32-bit immediate
// offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
                                          SDValue *SOffset, SDValue *Offset,
                                          bool Imm32Only, bool IsBuffer,
                                          bool HasSOffset,
                                          int64_t ImmOffset) const {
  assert((!SOffset || !Offset) &&
         "Cannot match both soffset and offset at the same time!");

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
  if (!C) {
    // A non-constant node can only be matched as an SGPR offset.
    if (!SOffset)
      return false;

    if (ByteOffsetNode.getValueType().isScalarInteger() &&
        ByteOffsetNode.getValueType().getSizeInBits() == 32) {
      *SOffset = ByteOffsetNode;
      return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                         ImmOffset);
    }
    // Look through a zero-extension from a 32-bit value.
    if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
      if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
        *SOffset = ByteOffsetNode.getOperand(0);
        return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                           ImmOffset);
      }
    }
    return false;
  }

  SDLoc SL(ByteOffsetNode);

  // GFX9 and GFX10 have signed byte immediate offsets. The immediate
  // offset for S_BUFFER instructions is unsigned.
  int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
  std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
      *Subtarget, ByteOffset, IsBuffer, HasSOffset);
  if (EncodedOffset && Offset && !Imm32Only) {
    *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32);
    return true;
  }

  // SGPR and literal offsets are unsigned.
  if (ByteOffset < 0)
    return false;

  // Try the CI 32-bit literal encoding.
  EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
  if (EncodedOffset && Offset && Imm32Only) {
    *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
    return true;
  }

  if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
    return false;

  if (SOffset) {
    // As a last resort, materialize the constant in an SGPR via S_MOV_B32
    // and use it as the SGPR offset.
    SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
    *SOffset = SDValue(
        CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
    return true;
  }

  return false;
}
2205
// Widen a 32-bit address to 64 bits by pairing it with the function's
// 32-bit-address high word in a REG_SEQUENCE. Addresses that are not i32
// are returned unchanged.
SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
  if (Addr.getValueType() != MVT::i32)
    return Addr;

  // Zero-extend a 32-bit address.
  SDLoc SL(Addr);

  // The high half comes from the machine function info, not a literal zero.
  const MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  unsigned AddrHiVal = Info->get32BitAddressHighBits();
  SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);

  // REG_SEQUENCE operands: register class id, then (value, subreg) pairs for
  // the low (sub0) and high (sub1) halves.
  const SDValue Ops[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
    Addr,
    CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
    SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
            0),
    CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
  };

  return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
                                        Ops), 0);
}
2230
// Match a base and an immediate (if Offset is not null) or an SGPR (if
// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
// true, match only 32-bit immediate offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
                                              SDValue *SOffset, SDValue *Offset,
                                              bool Imm32Only, bool IsBuffer,
                                              bool HasSOffset,
                                              int64_t ImmOffset) const {
  if (SOffset && Offset) {
    assert(!Imm32Only && !IsBuffer);
    SDValue B;

    // Two-step match: first peel off an immediate offset, then match an SGPR
    // offset on the remaining base, threading the immediate through so the
    // combined legality can be checked.
    if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
      return false;

    int64_t ImmOff = 0;
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
      ImmOff = C->getSExtValue();

    return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
                                ImmOff);
  }

  // A 32-bit (address + offset) should not cause unsigned 32-bit integer
  // wraparound, because s_load instructions perform the addition in 64 bits.
  if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
      !Addr->getFlags().hasNoUnsignedWrap())
    return false;

  SDValue N0, N1;
  // Extract the base and offset if possible.
  if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
    N0 = Addr.getOperand(0);
    N1 = Addr.getOperand(1);
  } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
    assert(N0 && N1 && isa<ConstantSDNode>(N1));
  }
  if (!N0 || !N1)
    return false;

  // Addition is commutative: try either operand as the offset.
  if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset)) {
    SBase = N0;
    return true;
  }
  if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset)) {
    SBase = N1;
    return true;
  }
  return false;
}
2283
SelectSMRD(SDValue Addr,SDValue & SBase,SDValue * SOffset,SDValue * Offset,bool Imm32Only) const2284 bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2285 SDValue *SOffset, SDValue *Offset,
2286 bool Imm32Only) const {
2287 if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2288 SBase = Expand32BitAddress(SBase);
2289 return true;
2290 }
2291
2292 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2293 SBase = Expand32BitAddress(Addr);
2294 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2295 return true;
2296 }
2297
2298 return false;
2299 }
2300
// Match an SMRD access with an immediate offset only.
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
                                       SDValue &Offset) const {
  return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
}
2305
// Match an SMRD access with a 32-bit literal immediate offset (Sea Islands
// only form).
bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
                                         SDValue &Offset) const {
  assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
  return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
                    /* Imm32Only */ true);
}
2312
// Match an SMRD access with an SGPR (soffset) offset only.
bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
                                        SDValue &SOffset) const {
  return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
}
2317
// Match an SMRD access with both an SGPR offset and an immediate offset.
bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
                                           SDValue &SOffset,
                                           SDValue &Offset) const {
  return SelectSMRD(Addr, SBase, &SOffset, &Offset);
}
2323
// Match an S_BUFFER offset as an immediate.
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
  return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
                          /* Imm32Only */ false, /* IsBuffer */ true);
}
2328
// Match an S_BUFFER offset as a 32-bit literal immediate (Sea Islands only
// form).
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
                                               SDValue &Offset) const {
  assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
  return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
                          /* Imm32Only */ true, /* IsBuffer */ true);
}
2335
// Match an S_BUFFER (SGPR base + immediate) offset pair from a 32-bit value.
bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
                                                 SDValue &Offset) const {
  // Match the (soffset + offset) pair as a 32-bit register base and
  // an immediate offset.
  return N.getValueType() == MVT::i32 &&
         SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
                              &Offset, /* Imm32Only */ false,
                              /* IsBuffer */ true);
}
2345
// Split Index for MOVREL addressing into a base and a constant offset,
// peeling the constant only when that cannot make the base negative.
bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
                                            SDValue &Base,
                                            SDValue &Offset) const {
  SDLoc DL(Index);

  if (CurDAG->isBaseWithConstantOffset(Index)) {
    SDValue N0 = Index.getOperand(0);
    SDValue N1 = Index.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);

    // (add n0, c0)
    // Don't peel off the offset (c0) if doing so could possibly lead
    // the base (n0) to be negative.
    // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
    if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
        (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
      return true;
    }
  }

  // Leave a bare constant index to other patterns.
  if (isa<ConstantSDNode>(Index))
    return false;

  // Fall back to the whole index with a zero offset.
  Base = Index;
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}
2375
getBFE32(bool IsSigned,const SDLoc & DL,SDValue Val,uint32_t Offset,uint32_t Width)2376 SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2377 SDValue Val, uint32_t Offset,
2378 uint32_t Width) {
2379 if (Val->isDivergent()) {
2380 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2381 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2382 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2383
2384 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2385 }
2386 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2387 // Transformation function, pack the offset and width of a BFE into
2388 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2389 // source, bits [5:0] contain the offset and bits [22:16] the width.
2390 uint32_t PackedVal = Offset | (Width << 16);
2391 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2392
2393 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2394 }
2395
// Try to fold a shl feeding an srl/sra into a single bitfield extract.
void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
  // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
  // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
  // Predicate: 0 < b <= c < 32

  const SDValue &Shl = N->getOperand(0);
  ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));

  if (B && C) {
    uint32_t BVal = B->getZExtValue();
    uint32_t CVal = C->getZExtValue();

    if (0 < BVal && BVal <= CVal && CVal < 32) {
      // An arithmetic right shift (SRA) selects the signed extract.
      bool Signed = N->getOpcode() == ISD::SRA;
      ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
                              32 - CVal));
      return;
    }
  }
  // Shift amounts unsuitable for a BFE: defer to the generated matcher.
  SelectCode(N);
}
2418
// Try to select AND/SRL/SRA/SIGN_EXTEND_INREG nodes as a single bitfield
// extract, falling back to the generated matcher when no BFE form applies.
void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
  switch (N->getOpcode()) {
  case ISD::AND:
    if (N->getOperand(0).getOpcode() == ISD::SRL) {
      // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
      // Predicate: isMask(mask)
      const SDValue &Srl = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue();

        if (isMask_32(MaskVal)) {
          // The extract width is the number of set bits in the mask.
          uint32_t WidthVal = llvm::popcount(MaskVal);
          ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
                                  WidthVal));
          return;
        }
      }
    }
    break;
  case ISD::SRL:
    if (N->getOperand(0).getOpcode() == ISD::AND) {
      // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
      // Predicate: isMask(mask >> b)
      const SDValue &And = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        // Shift the mask down first; it must be contiguous after the shift.
        uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = llvm::popcount(MaskVal);
          ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
                                  WidthVal));
          return;
        }
      }
    } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
      // shl followed by srl: handled by the shift-pair matcher.
      SelectS_BFEFromShifts(N);
      return;
    }
    break;
  case ISD::SRA:
    if (N->getOperand(0).getOpcode() == ISD::SHL) {
      // shl followed by sra: signed variant of the shift-pair matcher.
      SelectS_BFEFromShifts(N);
      return;
    }
    break;

  case ISD::SIGN_EXTEND_INREG: {
    // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
    SDValue Src = N->getOperand(0);
    if (Src.getOpcode() != ISD::SRL)
      break;

    const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
    if (!Amt)
      break;

    // The extract width is the size of the sext_inreg's VT operand.
    unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
    ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
                            Amt->getZExtValue(), Width));
    return;
  }
  }

  // No BFE pattern matched; use the auto-generated selector.
  SelectCode(N);
}
2492
isCBranchSCC(const SDNode * N) const2493 bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2494 assert(N->getOpcode() == ISD::BRCOND);
2495 if (!N->hasOneUse())
2496 return false;
2497
2498 SDValue Cond = N->getOperand(1);
2499 if (Cond.getOpcode() == ISD::CopyToReg)
2500 Cond = Cond.getOperand(2);
2501
2502 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2503 return false;
2504
2505 MVT VT = Cond.getOperand(0).getSimpleValueType();
2506 if (VT == MVT::i32)
2507 return true;
2508
2509 if (VT == MVT::i64) {
2510 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2511 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2512 Subtarget->hasScalarCompareEq64();
2513 }
2514
2515 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2516 return true;
2517
2518 return false;
2519 }
2520
// If VCMP implements the amdgcn.ballot pattern described below, return the
// underlying i1 condition; Negate is set when the SETEQ form was matched.
// Returns an empty SDValue if the pattern does not apply.
static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
  assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
  // Special case for amdgcn.ballot:
  // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
  // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
  // =>
  // Use i1 %Cond value instead of i(WaveSize) %VCMP.
  // This is possible because divergent ISD::SETCC is selected as V_CMP and
  // Cond becomes a i(WaveSize) full mask value.
  // Note that ballot doesn't use SETEQ condition but its easy to support it
  // here for completeness, so in this case Negate is set true on return.
  auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
  if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
      isNullConstant(VCMP.getOperand(1))) {

    auto Cond = VCMP.getOperand(0);
    if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
      Cond = Cond.getOperand(0);

    if (isBoolSGPR(Cond)) {
      Negate = VCMP_CC == ISD::SETEQ;
      return Cond;
    }
  }
  return SDValue();
}
2547
// Select a BRCOND node into a scalar (S_CBRANCH_SCC*) or vector
// (S_CBRANCH_VCC*) conditional branch, masking the condition with EXEC when
// the vcc bits of disabled lanes are not known to be zero.
void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
  SDValue Cond = N->getOperand(1);

  if (Cond.isUndef()) {
    // Undefined condition: branch direction is arbitrary.
    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
                         N->getOperand(2), N->getOperand(0));
    return;
  }

  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
  bool AndExec = !UseSCCBr;
  bool Negate = false;

  if (Cond.getOpcode() == ISD::SETCC &&
      Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
    SDValue VCMP = Cond->getOperand(0);
    auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        isNullConstant(Cond->getOperand(1)) &&
        // We may encounter ballot.i64 in wave32 mode on -O0.
        VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
      // BRCOND i1 %C, %BB
      // =>
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // VCC = COPY i(WaveSize) %VCMP
      // S_CBRANCH_VCCNZ/VCCZ %BB
      Negate = CC == ISD::SETEQ;
      bool NegatedBallot = false;
      if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
        // Branch on the ballot's underlying i1 condition directly.
        Cond = BallotCond;
        UseSCCBr = !BallotCond->isDivergent();
        Negate = Negate ^ NegatedBallot;
      } else {
        // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
        // selected as V_CMP, but this may change for uniform condition.
        Cond = VCMP;
        UseSCCBr = false;
      }
    }
    // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
    // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
    // used.
    AndExec = false;
  }

  unsigned BrOp =
      UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
               : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
  Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
  SDLoc SL(N);

  if (AndExec) {
    // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
    // analyzed what generates the vcc value, so we do not know whether vcc
    // bits for disabled lanes are 0. Thus we need to mask out bits for
    // disabled lanes.
    //
    // For the case that we select S_CBRANCH_SCC1 and it gets
    // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU which inserts the S_AND).
    //
    // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when is unnecessary. But it would be better to add a separate
    // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
    // catches both cases.
    Cond = SDValue(
        CurDAG->getMachineNode(
            Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
            MVT::i1,
            CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO
                                                      : AMDGPU::EXEC,
                                MVT::i1),
            Cond),
        0);
  }

  // Copy the (possibly masked) condition into SCC or VCC and emit the branch.
  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
  CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
                       N->getOperand(2), // Basic Block
                       VCC.getValue(0));
}
2633
SelectFP_EXTEND(SDNode * N)2634 void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2635 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2636 !N->isDivergent()) {
2637 SDValue Src = N->getOperand(0);
2638 if (Src.getValueType() == MVT::f16) {
2639 if (isExtractHiElt(Src, Src)) {
2640 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2641 {Src});
2642 return;
2643 }
2644 }
2645 }
2646
2647 SelectCode(N);
2648 }
2649
// Select llvm.amdgcn.ds.append / llvm.amdgcn.ds.consume into DS_APPEND /
// DS_CONSUME, moving the pointer (or its base) into M0 and folding a legal
// constant offset into the instruction's offset field when possible.
void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
  // The address is assumed to be uniform, so if it ends up in a VGPR, it will
  // be copied to an SGPR with readfirstlane.
  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  SDValue Chain = N->getOperand(0);
  SDValue Ptr = N->getOperand(2);
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();
  // GDS (region address space) variants set the gds bit on the instruction.
  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  SDValue Offset;
  if (CurDAG->isBaseWithConstantOffset(Ptr)) {
    SDValue PtrBase = Ptr.getOperand(0);
    SDValue PtrOffset = Ptr.getOperand(1);

    const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
      // Base goes into M0, the constant part into the offset field.
      N = glueCopyToM0(N, PtrBase);
      Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
    }
  }

  if (!Offset) {
    // No foldable offset: the whole pointer goes into M0, offset field is 0.
    N = glueCopyToM0(N, Ptr);
    Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
  }

  SDValue Ops[] = {
    Offset,
    CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
    Chain,
    N->getOperand(N->getNumOperands() - 1) // New glue
  };

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
2689
2690 // We need to handle this here because tablegen doesn't support matching
2691 // instructions with multiple outputs.
SelectDSBvhStackIntrinsic(SDNode * N,unsigned IntrID)2692 void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
2693 unsigned Opc;
2694 switch (IntrID) {
2695 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2696 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2697 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2698 break;
2699 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2700 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2701 break;
2702 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2703 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2704 break;
2705 }
2706 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2707 N->getOperand(5), N->getOperand(0)};
2708
2709 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2710 MachineMemOperand *MMO = M->getMemOperand();
2711 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2712 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2713 }
2714
gwsIntrinToOpcode(unsigned IntrID)2715 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2716 switch (IntrID) {
2717 case Intrinsic::amdgcn_ds_gws_init:
2718 return AMDGPU::DS_GWS_INIT;
2719 case Intrinsic::amdgcn_ds_gws_barrier:
2720 return AMDGPU::DS_GWS_BARRIER;
2721 case Intrinsic::amdgcn_ds_gws_sema_v:
2722 return AMDGPU::DS_GWS_SEMA_V;
2723 case Intrinsic::amdgcn_ds_gws_sema_br:
2724 return AMDGPU::DS_GWS_SEMA_BR;
2725 case Intrinsic::amdgcn_ds_gws_sema_p:
2726 return AMDGPU::DS_GWS_SEMA_P;
2727 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2728 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2729 default:
2730 llvm_unreachable("not a gws intrinsic");
2731 }
2732 }
2733
// Select the ds_gws family of intrinsics. The resource/offset computation
// is split between M0 and the instruction's immediate offset field.
void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
  if (!Subtarget->hasGWS() ||
      (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
       !Subtarget->hasGWSSemaReleaseAll())) {
    // Let this error.
    SelectCode(N);
    return;
  }

  // Chain, intrinsic ID, vsrc, offset
  const bool HasVSrc = N->getNumOperands() == 4;
  assert(HasVSrc || N->getNumOperands() == 3);

  SDLoc SL(N);
  SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
  int ImmOffset = 0;
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();

  // Don't worry if the offset ends up in a VGPR. Only one lane will have
  // effect, so SIFixSGPRCopies will validly insert readfirstlane.

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.
    glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
    ImmOffset = ConstOffset->getZExtValue();
  } else {
    // Peel a constant addend off the variable offset, if any.
    if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
      ImmOffset = BaseOffset.getConstantOperandVal(1);
      BaseOffset = BaseOffset.getOperand(0);
    }

    // Prefer to do the shift in an SGPR since it should be possible to use m0
    // as the result directly. If it's already an SGPR, it will be eliminated
    // later.
    SDNode *SGPROffset
      = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
                               BaseOffset);
    // Shift to offset in m0
    SDNode *M0Base
      = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
                               SDValue(SGPROffset, 0),
                               CurDAG->getTargetConstant(16, SL, MVT::i32));
    glueCopyToM0(N, SDValue(M0Base, 0));
  }

  SDValue Chain = N->getOperand(0);
  SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);

  const unsigned Opc = gwsIntrinToOpcode(IntrID);
  SmallVector<SDValue, 5> Ops;
  if (HasVSrc)
    Ops.push_back(N->getOperand(2));
  Ops.push_back(OffsetField);
  Ops.push_back(Chain);

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
2799
// Select llvm.amdgcn.interp.p1.f16. On 16-bank-LDS subtargets this expands
// to V_INTERP_MOV_F32 + V_INTERP_P1LV_F16 glued to an M0 copy; otherwise
// the ordinary single-instruction pattern applies.
void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
  if (Subtarget->getLDSBankCount() != 16) {
    // This is a single instruction with a pattern.
    SelectCode(N);
    return;
  }

  SDLoc DL(N);

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.
  //
  // def : Pat <
  //   (int_amdgcn_interp_p1_f16
  //                             (VOP3Mods f32:$src0, i32:$src0_modifiers),
  //                                       (i32 timm:$attrchan), (i32 timm:$attr),
  //                                       (i1 timm:$high), M0),
  //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
  //       timm:$attrchan, 0,
  //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
  //   let Predicates = [has16BankLDS];
  // }

  // 16 bank LDS
  // Copy the M0 operand (intrinsic operand 5) into the physical M0 register;
  // the glue result chains the two interp instructions after the copy.
  SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
                                      N->getOperand(5), SDValue());

  SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);

  SDNode *InterpMov =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
        CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
        N->getOperand(3),  // Attr
        N->getOperand(2),  // Attrchan
        ToM0.getValue(1) // In glue
  });

  SDNode *InterpP1LV =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
        N->getOperand(1), // Src0
        N->getOperand(3), // Attr
        N->getOperand(2), // Attrchan
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
        SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
        N->getOperand(4), // high
        CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
        SDValue(InterpMov, 1)
  });

  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
}
2857
// Dispatch INTRINSIC_W_CHAIN nodes that need manual selection; everything
// else (including the cases that only record state and then break) falls
// through to the generated matcher.
void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
  unsigned IntrID = N->getConstantOperandVal(1);
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    // Only the i32 form is handled manually; other types hit the default
    // matcher (and its error path).
    if (N->getValueType(0) != MVT::i32)
      break;
    SelectDSAppendConsume(N, IntrID);
    return;
  }
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
    SelectDSBvhStackIntrinsic(N, IntrID);
    return;
  case Intrinsic::amdgcn_init_whole_wave:
    // Record the use on the function info, then still select normally.
    CurDAG->getMachineFunction()
        .getInfo<SIMachineFunctionInfo>()
        ->setInitWholeWave();
    break;
  }

  SelectCode(N);
}
2883
// Select INTRINSIC_WO_CHAIN nodes, handling convergence-control glue and the
// WQM/WWM wrapper pseudos that take a single source operand.
void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
  unsigned IntrID = N->getConstantOperandVal(0);
  unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
  SDNode *ConvGlueNode = N->getGluedNode();
  if (ConvGlueNode) {
    // FIXME: Possibly iterate over multiple glue nodes?
    assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
    // Re-wrap the glued token in a machine-level CONVERGENCECTRL_GLUE so it
    // survives selection.
    ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
    ConvGlueNode =
        CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
                               MVT::Glue, SDValue(ConvGlueNode, 0));
  } else {
    ConvGlueNode = nullptr;
  }
  switch (IntrID) {
  case Intrinsic::amdgcn_wqm:
    Opcode = AMDGPU::WQM;
    break;
  case Intrinsic::amdgcn_softwqm:
    Opcode = AMDGPU::SOFT_WQM;
    break;
  case Intrinsic::amdgcn_wwm:
  case Intrinsic::amdgcn_strict_wwm:
    Opcode = AMDGPU::STRICT_WWM;
    break;
  case Intrinsic::amdgcn_strict_wqm:
    Opcode = AMDGPU::STRICT_WQM;
    break;
  case Intrinsic::amdgcn_interp_p1_f16:
    SelectInterpP1F16(N);
    return;
  case Intrinsic::amdgcn_permlane16_swap:
  case Intrinsic::amdgcn_permlane32_swap: {
    if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
         !Subtarget->hasPermlane16Swap()) ||
        (IntrID == Intrinsic::amdgcn_permlane32_swap &&
         !Subtarget->hasPermlane32Swap())) {
      SelectCode(N); // Hit the default error
      return;
    }

    Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
                 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
                 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;

    // Drop the intrinsic ID (operand 0); append glue if present.
    SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
    if (ConvGlueNode)
      NewOps.push_back(SDValue(ConvGlueNode, 0));

    // Rewrite the boolean fi operand into the DPP fetch-invalid encoding.
    bool FI = N->getConstantOperandVal(3);
    NewOps[2] = CurDAG->getTargetConstant(
        FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(), MVT::i32);

    CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps);
    return;
  }
  default:
    SelectCode(N);
    break;
  }

  if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
    SDValue Src = N->getOperand(1);
    CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
  }

  if (ConvGlueNode) {
    // Attach the convergence glue to whatever node N was selected into.
    SmallVector<SDValue, 4> NewOps(N->ops());
    NewOps.push_back(SDValue(ConvGlueNode, 0));
    CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
  }
}
2956
SelectINTRINSIC_VOID(SDNode * N)2957 void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2958 unsigned IntrID = N->getConstantOperandVal(1);
2959 switch (IntrID) {
2960 case Intrinsic::amdgcn_ds_gws_init:
2961 case Intrinsic::amdgcn_ds_gws_barrier:
2962 case Intrinsic::amdgcn_ds_gws_sema_v:
2963 case Intrinsic::amdgcn_ds_gws_sema_br:
2964 case Intrinsic::amdgcn_ds_gws_sema_p:
2965 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2966 SelectDS_GWS(N, IntrID);
2967 return;
2968 default:
2969 break;
2970 }
2971
2972 SelectCode(N);
2973 }
2974
SelectWAVE_ADDRESS(SDNode * N)2975 void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2976 SDValue Log2WaveSize =
2977 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
2978 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
2979 {N->getOperand(0), Log2WaveSize});
2980 }
2981
// Select STACKRESTORE: copy the saved value back into the stack pointer
// register, undoing the wave-address shift when the source isn't already a
// wave address.
void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
  SDValue SrcVal = N->getOperand(1);
  if (SrcVal.getValueType() != MVT::i32) {
    SelectCode(N); // Emit default error
    return;
  }

  SDValue CopyVal;
  Register SP = TLI->getStackPointerRegisterToSaveRestore();
  SDLoc SL(N);

  if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
    // The value was produced by a wave-address wrap; the unshifted SP value
    // is its operand, so copy that directly.
    CopyVal = SrcVal.getOperand(0);
  } else {
    SDValue Log2WaveSize = CurDAG->getTargetConstant(
        Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);

    // SP is scalar; a divergent source must be made uniform first.
    if (N->isDivergent()) {
      SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
                                              MVT::i32, SrcVal),
                       0);
    }

    // Shift left by log2(wavefront size) to rebuild the SP-register value.
    CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
                                             {SrcVal, Log2WaveSize}),
                      0);
  }

  SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
}
3013
SelectVOP3ModsImpl(SDValue In,SDValue & Src,unsigned & Mods,bool IsCanonicalizing,bool AllowAbs) const3014 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
3015 unsigned &Mods,
3016 bool IsCanonicalizing,
3017 bool AllowAbs) const {
3018 Mods = SISrcMods::NONE;
3019 Src = In;
3020
3021 if (Src.getOpcode() == ISD::FNEG) {
3022 Mods |= SISrcMods::NEG;
3023 Src = Src.getOperand(0);
3024 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
3025 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3026 // denormal mode, but we're implicitly canonicalizing in a source operand.
3027 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
3028 if (LHS && LHS->isZero()) {
3029 Mods |= SISrcMods::NEG;
3030 Src = Src.getOperand(1);
3031 }
3032 }
3033
3034 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
3035 Mods |= SISrcMods::ABS;
3036 Src = Src.getOperand(0);
3037 }
3038
3039 return true;
3040 }
3041
SelectVOP3Mods(SDValue In,SDValue & Src,SDValue & SrcMods) const3042 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3043 SDValue &SrcMods) const {
3044 unsigned Mods;
3045 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
3046 /*AllowAbs=*/true)) {
3047 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3048 return true;
3049 }
3050
3051 return false;
3052 }
3053
SelectVOP3ModsNonCanonicalizing(SDValue In,SDValue & Src,SDValue & SrcMods) const3054 bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3055 SDValue In, SDValue &Src, SDValue &SrcMods) const {
3056 unsigned Mods;
3057 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
3058 /*AllowAbs=*/true)) {
3059 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3060 return true;
3061 }
3062
3063 return false;
3064 }
3065
SelectVOP3BMods(SDValue In,SDValue & Src,SDValue & SrcMods) const3066 bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3067 SDValue &SrcMods) const {
3068 unsigned Mods;
3069 if (SelectVOP3ModsImpl(In, Src, Mods,
3070 /*IsCanonicalizing=*/true,
3071 /*AllowAbs=*/false)) {
3072 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3073 return true;
3074 }
3075
3076 return false;
3077 }
3078
SelectVOP3NoMods(SDValue In,SDValue & Src) const3079 bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3080 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
3081 return false;
3082
3083 Src = In;
3084 return true;
3085 }
3086
SelectVINTERPModsImpl(SDValue In,SDValue & Src,SDValue & SrcMods,bool OpSel) const3087 bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3088 SDValue &SrcMods,
3089 bool OpSel) const {
3090 unsigned Mods;
3091 if (SelectVOP3ModsImpl(In, Src, Mods,
3092 /*IsCanonicalizing=*/true,
3093 /*AllowAbs=*/false)) {
3094 if (OpSel)
3095 Mods |= SISrcMods::OP_SEL_0;
3096 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3097 return true;
3098 }
3099
3100 return false;
3101 }
3102
// Match VINTERP source modifiers selecting the low half (op_sel off).
bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
                                           SDValue &SrcMods) const {
  return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
}
3107
// Match VINTERP source modifiers selecting the high half (op_sel on).
bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
}
3112
SelectVOP3Mods0(SDValue In,SDValue & Src,SDValue & SrcMods,SDValue & Clamp,SDValue & Omod) const3113 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3114 SDValue &SrcMods, SDValue &Clamp,
3115 SDValue &Omod) const {
3116 SDLoc DL(In);
3117 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3118 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3119
3120 return SelectVOP3Mods(In, Src, SrcMods);
3121 }
3122
SelectVOP3BMods0(SDValue In,SDValue & Src,SDValue & SrcMods,SDValue & Clamp,SDValue & Omod) const3123 bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3124 SDValue &SrcMods, SDValue &Clamp,
3125 SDValue &Omod) const {
3126 SDLoc DL(In);
3127 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3128 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3129
3130 return SelectVOP3BMods(In, Src, SrcMods);
3131 }
3132
SelectVOP3OMods(SDValue In,SDValue & Src,SDValue & Clamp,SDValue & Omod) const3133 bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3134 SDValue &Clamp, SDValue &Omod) const {
3135 Src = In;
3136
3137 SDLoc DL(In);
3138 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3139 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3140
3141 return true;
3142 }
3143
// Match packed (VOP3P) source modifiers: per-half neg bits plus op_sel
// bits, looking through build_vector and vector_shuffle to find the real
// sources. Always matches; the fallback is op_sel_1 with no negation.
bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods, bool IsDOT) const {
  unsigned Mods = SISrcMods::NONE;
  Src = In;

  // TODO: Handle G_FSUB 0 as fneg
  if (Src.getOpcode() == ISD::FNEG) {
    // fneg of the whole vector negates both halves.
    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
    Src = Src.getOperand(0);
  }

  if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
      (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
    // Remember the mods so far; if the element analysis below doesn't pan
    // out we restore them (see "Mods = VecMods" at the end of this branch).
    unsigned VecMods = Mods;

    SDValue Lo = stripBitcast(Src.getOperand(0));
    SDValue Hi = stripBitcast(Src.getOperand(1));

    if (Lo.getOpcode() == ISD::FNEG) {
      Lo = stripBitcast(Lo.getOperand(0));
      Mods ^= SISrcMods::NEG;
    }

    if (Hi.getOpcode() == ISD::FNEG) {
      Hi = stripBitcast(Hi.getOperand(0));
      Mods ^= SISrcMods::NEG_HI;
    }

    // A half taken from the high 16 bits of a dword is expressed via op_sel.
    if (isExtractHiElt(Lo, Lo))
      Mods |= SISrcMods::OP_SEL_0;

    if (isExtractHiElt(Hi, Hi))
      Mods |= SISrcMods::OP_SEL_1;

    unsigned VecSize = Src.getValueSizeInBits();
    Lo = stripExtractLoElt(Lo);
    Hi = stripExtractLoElt(Hi);

    if (Lo.getValueSizeInBits() > VecSize) {
      Lo = CurDAG->getTargetExtractSubreg(
        (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
        MVT::getIntegerVT(VecSize), Lo);
    }

    if (Hi.getValueSizeInBits() > VecSize) {
      Hi = CurDAG->getTargetExtractSubreg(
        (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
        MVT::getIntegerVT(VecSize), Hi);
    }

    assert(Lo.getValueSizeInBits() <= VecSize &&
           Hi.getValueSizeInBits() <= VecSize);

    if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
      // Really a scalar input. Just select from the low half of the register to
      // avoid packing.

      if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
        Src = Lo;
      } else {
        assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);

        // Widen the 32-bit value to 64 bits with an undef high half via
        // REG_SEQUENCE, choosing VGPR/SGPR class by divergence.
        SDLoc SL(In);
        SDValue Undef = SDValue(
          CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
                                 Lo.getValueType()), 0);
        auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
                                    : AMDGPU::SReg_64RegClassID;
        const SDValue Ops[] = {
          CurDAG->getTargetConstant(RC, SL, MVT::i32),
          Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
          Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };

        Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
                                             Src.getValueType(), Ops), 0);
      }
      SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
      return true;
    }

    if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
      // Both halves are the same FP constant; use it as a literal when it is
      // a valid 32-bit inline immediate.
      uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
                      .bitcastToAPInt().getZExtValue();
      if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
        Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
        SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
        return true;
      }
    }

    // Element analysis failed; drop any per-element mods collected above.
    Mods = VecMods;
  } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
             Src.getNumOperands() == 2) {

    // TODO: We should repeat the build_vector source check above for the
    // vector_shuffle for negates and casts of individual elements.

    auto *SVN = cast<ShuffleVectorSDNode>(Src);
    ArrayRef<int> Mask = SVN->getMask();

    if (Mask[0] < 2 && Mask[1] < 2) {
      // src1 should be undef.
      SDValue ShuffleSrc = SVN->getOperand(0);

      if (ShuffleSrc.getOpcode() == ISD::FNEG) {
        ShuffleSrc = ShuffleSrc.getOperand(0);
        Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
      }

      // The shuffle mask picks which half feeds each lane; encode as op_sel.
      if (Mask[0] == 1)
        Mods |= SISrcMods::OP_SEL_0;
      if (Mask[1] == 1)
        Mods |= SISrcMods::OP_SEL_1;

      Src = ShuffleSrc;
      SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
      return true;
    }
  }

  // Packed instructions do not have abs modifiers.
  Mods |= SISrcMods::OP_SEL_1;

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}
3270
// Match packed source modifiers for DOT instructions (enables the DOT
// op_sel-hazard restriction inside SelectVOP3PMods).
bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
                                            SDValue &SrcMods) const {
  return SelectVOP3PMods(In, Src, SrcMods, true);
}
3275
SelectVOP3PModsNeg(SDValue In,SDValue & Src) const3276 bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
3277 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3278 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3279 // 1 promotes packed values to signed, 0 treats them as unsigned.
3280 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3281
3282 unsigned Mods = SISrcMods::OP_SEL_1;
3283 unsigned SrcSign = C->getZExtValue();
3284 if (SrcSign == 1)
3285 Mods ^= SISrcMods::NEG;
3286
3287 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3288 return true;
3289 }
3290
SelectWMMAOpSelVOP3PMods(SDValue In,SDValue & Src) const3291 bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3292 SDValue &Src) const {
3293 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3294 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3295
3296 unsigned Mods = SISrcMods::OP_SEL_1;
3297 unsigned SrcVal = C->getZExtValue();
3298 if (SrcVal == 1)
3299 Mods |= SISrcMods::OP_SEL_0;
3300
3301 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3302 return true;
3303 }
3304
buildRegSequence32(SmallVectorImpl<SDValue> & Elts,llvm::SelectionDAG * CurDAG,const SDLoc & DL)3305 static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3306 llvm::SelectionDAG *CurDAG,
3307 const SDLoc &DL) {
3308 unsigned DstRegClass;
3309 EVT DstTy;
3310 switch (Elts.size()) {
3311 case 8:
3312 DstRegClass = AMDGPU::VReg_256RegClassID;
3313 DstTy = MVT::v8i32;
3314 break;
3315 case 4:
3316 DstRegClass = AMDGPU::VReg_128RegClassID;
3317 DstTy = MVT::v4i32;
3318 break;
3319 case 2:
3320 DstRegClass = AMDGPU::VReg_64RegClassID;
3321 DstTy = MVT::v2i32;
3322 break;
3323 default:
3324 llvm_unreachable("unhandled Reg sequence size");
3325 }
3326
3327 SmallVector<SDValue, 17> Ops;
3328 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3329 for (unsigned i = 0; i < Elts.size(); ++i) {
3330 Ops.push_back(Elts[i]);
3331 Ops.push_back(CurDAG->getTargetConstant(
3332 SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
3333 }
3334 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3335 }
3336
buildRegSequence16(SmallVectorImpl<SDValue> & Elts,llvm::SelectionDAG * CurDAG,const SDLoc & DL)3337 static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3338 llvm::SelectionDAG *CurDAG,
3339 const SDLoc &DL) {
3340 SmallVector<SDValue, 8> PackedElts;
3341 assert("unhandled Reg sequence size" &&
3342 (Elts.size() == 8 || Elts.size() == 16));
3343
3344 // Pack 16-bit elements in pairs into 32-bit register. If both elements are
3345 // unpacked from 32-bit source use it, otherwise pack them using v_perm.
3346 for (unsigned i = 0; i < Elts.size(); i += 2) {
3347 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3348 SDValue HiSrc;
3349 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3350 PackedElts.push_back(HiSrc);
3351 } else {
3352 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3353 MachineSDNode *Packed =
3354 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3355 {Elts[i + 1], Elts[i], PackLoLo});
3356 PackedElts.push_back(SDValue(Packed, 0));
3357 }
3358 }
3359
3360 return buildRegSequence32(PackedElts, CurDAG, DL);
3361 }
3362
buildRegSequence(SmallVectorImpl<SDValue> & Elts,llvm::SelectionDAG * CurDAG,const SDLoc & DL,unsigned ElementSize)3363 static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3364 llvm::SelectionDAG *CurDAG,
3365 const SDLoc &DL, unsigned ElementSize) {
3366 if (ElementSize == 16)
3367 return buildRegSequence16(Elts, CurDAG, DL);
3368 if (ElementSize == 32)
3369 return buildRegSequence32(Elts, CurDAG, DL);
3370 llvm_unreachable("Unhandled element size");
3371 }
3372
// Given elements that all carry ModOpcode (FNEG or FABS), set the matching
// WMMA modifier bits and build the packed source register from the stripped
// elements. For FNEG, additionally checks whether every element also wraps
// an FABS so the combined neg+abs encoding can be used.
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
                                 SmallVectorImpl<SDValue> &Elts, SDValue &Src,
                                 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
                                 unsigned ElementSize) {
  if (ModOpcode == ISD::FNEG) {
    Mods |= SISrcMods::NEG;
    // Check if all elements also have abs modifier
    SmallVector<SDValue, 8> NegAbsElts;
    for (auto El : Elts) {
      if (El.getOpcode() != ISD::FABS)
        break;
      NegAbsElts.push_back(El->getOperand(0));
    }
    if (Elts.size() != NegAbsElts.size()) {
      // Neg
      Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
    } else {
      // Neg and Abs
      Mods |= SISrcMods::NEG_HI;
      Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
    }
  } else {
    assert(ModOpcode == ISD::FABS);
    // Abs
    Mods |= SISrcMods::NEG_HI;
    Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
  }
}
3401
3402 // Check all f16 elements for modifiers while looking through b32 and v2b16
3403 // build vector, stop if element does not satisfy ModifierCheck.
3404 static void
checkWMMAElementsModifiersF16(BuildVectorSDNode * BV,std::function<bool (SDValue)> ModifierCheck)3405 checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3406 std::function<bool(SDValue)> ModifierCheck) {
3407 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3408 if (auto *F16Pair =
3409 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3410 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3411 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3412 if (!ModifierCheck(ElF16))
3413 break;
3414 }
3415 }
3416 }
3417 }
3418
// Match WMMA f16 sources where every element (either at f16 or at v2f16
// granularity) carries an fneg; strips the negates, rebuilds the packed
// register, and sets NEG/NEG_HI. Always matches (falls back to no mods).
bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
                                              SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;

  // mods are on f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsF16;

    checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
      if (Element.getOpcode() != ISD::FNEG)
        return false;
      EltsF16.push_back(Element.getOperand(0));
      return true;
    });

    // All elements have neg modifier
    if (BV->getNumOperands() * 2 == EltsF16.size()) {
      Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
    }
  }

  // mods are on v2f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsV2F16;
    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
      // Based on first element decide which mod we match, neg or abs
      if (ElV2f16.getOpcode() != ISD::FNEG)
        break;
      EltsV2F16.push_back(ElV2f16.getOperand(0));
    }

    // All pairs of elements have neg modifier
    if (BV->getNumOperands() == EltsV2F16.size()) {
      Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
    }
  }

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}
3465
SelectWMMAModsF16NegAbs(SDValue In,SDValue & Src,SDValue & SrcMods) const3466 bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3467 SDValue &SrcMods) const {
3468 Src = In;
3469 unsigned Mods = SISrcMods::OP_SEL_1;
3470 unsigned ModOpcode;
3471
3472 // mods are on f16 elements
3473 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3474 SmallVector<SDValue, 8> EltsF16;
3475 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3476 // Based on first element decide which mod we match, neg or abs
3477 if (EltsF16.empty())
3478 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3479 if (ElF16.getOpcode() != ModOpcode)
3480 return false;
3481 EltsF16.push_back(ElF16.getOperand(0));
3482 return true;
3483 });
3484
3485 // All elements have ModOpcode modifier
3486 if (BV->getNumOperands() * 2 == EltsF16.size())
3487 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
3488 16);
3489 }
3490
3491 // mods are on v2f16 elements
3492 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3493 SmallVector<SDValue, 8> EltsV2F16;
3494
3495 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3496 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3497 // Based on first element decide which mod we match, neg or abs
3498 if (EltsV2F16.empty())
3499 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3500 if (ElV2f16->getOpcode() != ModOpcode)
3501 break;
3502 EltsV2F16.push_back(ElV2f16->getOperand(0));
3503 }
3504
3505 // All elements have ModOpcode modifier
3506 if (BV->getNumOperands() == EltsV2F16.size())
3507 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
3508 32);
3509 }
3510
3511 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3512 return true;
3513 }
3514
SelectWMMAModsF32NegAbs(SDValue In,SDValue & Src,SDValue & SrcMods) const3515 bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3516 SDValue &SrcMods) const {
3517 Src = In;
3518 unsigned Mods = SISrcMods::OP_SEL_1;
3519 SmallVector<SDValue, 8> EltsF32;
3520
3521 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3522 assert(BV->getNumOperands() > 0);
3523 // Based on first element decide which mod we match, neg or abs
3524 SDValue ElF32 = stripBitcast(BV->getOperand(0));
3525 unsigned ModOpcode =
3526 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3527 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3528 SDValue ElF32 = stripBitcast(BV->getOperand(i));
3529 if (ElF32.getOpcode() != ModOpcode)
3530 break;
3531 EltsF32.push_back(ElF32.getOperand(0));
3532 }
3533
3534 // All elements had ModOpcode modifier
3535 if (BV->getNumOperands() == EltsF32.size())
3536 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
3537 32);
3538 }
3539
3540 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3541 return true;
3542 }
3543
// Match a WMMA operand that is a splat of an inline-immediate constant so it
// can be encoded directly as an inline source. First tries a splat of the
// vector's own element type; then a 16-bit splat viewed through a bitcast to
// 32-bit elements. Returns false when no inline-immediate splat is found.
bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
  if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
    BitVector UndefElements;
    if (SDValue Splat = BV->getSplatValue(&UndefElements))
      if (isInlineImmediate(Splat.getNode())) {
        // Emit the raw bit pattern of the splat as an i32 target constant.
        if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
          unsigned Imm = C->getAPIntValue().getSExtValue();
          Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
          return true;
        }
        if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
          unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
          Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
          return true;
        }
        llvm_unreachable("unhandled Constant node");
      }
  }

  // 16 bit splat
  // A 16-bit splat may be hidden as a splat build_vector of i32 whose
  // elements are themselves bitcasts of a splat v2x16 build_vector.
  SDValue SplatSrc32 = stripBitcast(In);
  if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
    if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
      SDValue SplatSrc16 = stripBitcast(Splat32);
      if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
        if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
          const SIInstrInfo *TII = Subtarget->getInstrInfo();
          // Extract the raw 16-bit pattern from either an FP or int constant.
          std::optional<APInt> RawValue;
          if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
            RawValue = C->getValueAPF().bitcastToAPInt();
          else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
            RawValue = C->getAPIntValue();

          if (RawValue.has_value()) {
            EVT VT = In.getValueType().getScalarType();
            if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
              // Reinterpret the bits in the splat's own FP semantics before
              // asking whether they form an inline constant.
              APFloat FloatVal(VT.getSimpleVT() == MVT::f16
                                   ? APFloatBase::IEEEhalf()
                                   : APFloatBase::BFloat(),
                               RawValue.value());
              if (TII->isInlineConstant(FloatVal)) {
                Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
                                                MVT::i16);
                return true;
              }
            } else if (VT.getSimpleVT() == MVT::i16) {
              if (TII->isInlineConstant(RawValue.value())) {
                Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
                                                MVT::i16);
                return true;
              }
            } else
              llvm_unreachable("unknown 16-bit type");
          }
        }
    }

  return false;
}
3603
SelectSWMMACIndex8(SDValue In,SDValue & Src,SDValue & IndexKey) const3604 bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3605 SDValue &IndexKey) const {
3606 unsigned Key = 0;
3607 Src = In;
3608
3609 if (In.getOpcode() == ISD::SRL) {
3610 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3611 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3612 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3613 ShiftAmt->getZExtValue() % 8 == 0) {
3614 Key = ShiftAmt->getZExtValue() / 8;
3615 Src = ShiftSrc;
3616 }
3617 }
3618
3619 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3620 return true;
3621 }
3622
SelectSWMMACIndex16(SDValue In,SDValue & Src,SDValue & IndexKey) const3623 bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3624 SDValue &IndexKey) const {
3625 unsigned Key = 0;
3626 Src = In;
3627
3628 if (In.getOpcode() == ISD::SRL) {
3629 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3630 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3631 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3632 ShiftAmt->getZExtValue() == 16) {
3633 Key = 1;
3634 Src = ShiftSrc;
3635 }
3636 }
3637
3638 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3639 return true;
3640 }
3641
SelectVOP3OpSel(SDValue In,SDValue & Src,SDValue & SrcMods) const3642 bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
3643 SDValue &SrcMods) const {
3644 Src = In;
3645 // FIXME: Handle op_sel
3646 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
3647 return true;
3648 }
3649
SelectVOP3OpSelMods(SDValue In,SDValue & Src,SDValue & SrcMods) const3650 bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
3651 SDValue &SrcMods) const {
3652 // FIXME: Handle op_sel
3653 return SelectVOP3Mods(In, Src, SrcMods);
3654 }
3655
// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used (i.e. Src was an fp_extend
// from f16 that we folded away, so the instruction must select the f16
// source path via op_sel/op_sel_hi).
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                                   unsigned &Mods) const {
  Mods = 0;
  // First fold any outer fneg/fabs into Mods.
  SelectVOP3ModsImpl(In, Src, Mods);

  if (Src.getOpcode() == ISD::FP_EXTEND) {
    Src = Src.getOperand(0);
    assert(Src.getValueType() == MVT::f16);
    Src = stripBitcast(Src);

    // Be careful about folding modifiers if we already have an abs. fneg is
    // applied last, so we don't want to apply an earlier fneg.
    if ((Mods & SISrcMods::ABS) == 0) {
      unsigned ModsTmp;
      SelectVOP3ModsImpl(Src, Src, ModsTmp);

      // An inner fneg composes with (toggles) any outer fneg.
      if ((ModsTmp & SISrcMods::NEG) != 0)
        Mods ^= SISrcMods::NEG;

      // An inner fabs simply adds the abs modifier.
      if ((ModsTmp & SISrcMods::ABS) != 0)
        Mods |= SISrcMods::ABS;
    }

    // op_sel/op_sel_hi decide the source type and source.
    // If the source's op_sel_hi is set, it indicates to do a conversion from
    // fp16. If the sources's op_sel is set, it picks the high half of the
    // source register.

    Mods |= SISrcMods::OP_SEL_1;
    if (isExtractHiElt(Src, Src)) {
      Mods |= SISrcMods::OP_SEL_0;

      // TODO: Should we try to look for neg/abs here?
    }

    // Prevent unnecessary subreg COPY to VGPR_16
    if (Src.getOpcode() == ISD::TRUNCATE &&
        Src.getOperand(0).getValueType() == MVT::i32) {
      Src = Src.getOperand(0);
    }
    return true;
  }

  return false;
}
3703
SelectVOP3PMadMixModsExt(SDValue In,SDValue & Src,SDValue & SrcMods) const3704 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
3705 SDValue &SrcMods) const {
3706 unsigned Mods = 0;
3707 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
3708 return false;
3709 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3710 return true;
3711 }
3712
SelectVOP3PMadMixMods(SDValue In,SDValue & Src,SDValue & SrcMods) const3713 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
3714 SDValue &SrcMods) const {
3715 unsigned Mods = 0;
3716 SelectVOP3PMadMixModsImpl(In, Src, Mods);
3717 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3718 return true;
3719 }
3720
// Match BITOP3 operation and return a number of matched instructions plus
// truth table.
//
// Recursively walks a tree of AND/OR/XOR over at most three distinct source
// values, accumulating (via Src) the operands and computing the 8-entry truth
// table of the whole expression in terms of those three sources. Returns
// {0, 0} when the expression cannot be expressed with <= 3 sources.
static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
                                              SmallVectorImpl<SDValue> &Src) {
  unsigned NumOpcodes = 0;
  uint8_t LHSBits, RHSBits;

  // Assign (or look up) the truth-table bit pattern for a leaf operand.
  // Returns false if Op cannot be given a pattern without exceeding 3 sources.
  auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
    // Define truth table given Src0, Src1, Src2 bits permutations:
    // 0 0 0
    // 0 0 1
    // 0 1 0
    // 0 1 1
    // 1 0 0
    // 1 0 1
    // 1 1 0
    // 1 1 1
    const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };

    // Constant all-ones/zero leaves map to constant truth-table columns.
    if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->isAllOnes()) {
        Bits = 0xff;
        return true;
      }
      if (C->isZero()) {
        Bits = 0;
        return true;
      }
    }

    for (unsigned I = 0; I < Src.size(); ++I) {
      // Try to find existing reused operand
      if (Src[I] == Op) {
        Bits = SrcBits[I];
        return true;
      }
      // Try to replace parent operator
      if (Src[I] == In) {
        Bits = SrcBits[I];
        Src[I] = Op;
        return true;
      }
    }

    if (Src.size() == 3) {
      // No room left for operands. Try one last time, there can be a 'not' of
      // one of our source operands. In this case we can compute the bits
      // without growing Src vector.
      if (Op.getOpcode() == ISD::XOR) {
        if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          if (C->isAllOnes()) {
            SDValue LHS = Op.getOperand(0);
            for (unsigned I = 0; I < Src.size(); ++I) {
              if (Src[I] == LHS) {
                // xor with -1 is a 'not': invert the known pattern.
                Bits = ~SrcBits[I];
                return true;
              }
            }
          }
        }
      }

      return false;
    }

    // New distinct operand: claim the next source slot.
    Bits = SrcBits[Src.size()];
    Src.push_back(Op);
    return true;
  };

  switch (In.getOpcode()) {
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR: {
    SDValue LHS = In.getOperand(0);
    SDValue RHS = In.getOperand(1);

    // Snapshot Src so a failed match leaves the caller's state untouched.
    SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
    if (!getOperandBits(LHS, LHSBits) ||
        !getOperandBits(RHS, RHSBits)) {
      Src = Backup;
      return std::make_pair(0, 0);
    }

    // Recursion is naturally limited by the size of the operand vector.
    // A successful recursive match refines the leaf pattern computed above.
    auto Op = BitOp3_Op(LHS, Src);
    if (Op.first) {
      NumOpcodes += Op.first;
      LHSBits = Op.second;
    }

    Op = BitOp3_Op(RHS, Src);
    if (Op.first) {
      NumOpcodes += Op.first;
      RHSBits = Op.second;
    }
    break;
  }
  default:
    return std::make_pair(0, 0);
  }

  // Combine the children's truth tables with this node's boolean operation.
  uint8_t TTbl;
  switch (In.getOpcode()) {
  case ISD::AND:
    TTbl = LHSBits & RHSBits;
    break;
  case ISD::OR:
    TTbl = LHSBits | RHSBits;
    break;
  case ISD::XOR:
    TTbl = LHSBits ^ RHSBits;
    break;
  default:
    break;
  }

  return std::make_pair(NumOpcodes + 1, TTbl);
}
3840
// Try to select a tree of AND/OR/XOR as a single V_BITOP3 instruction.
// On success fills Src0..Src2 with the (up to three) source values and Tbl
// with the 8-bit truth table; returns false if the tree is too small to be
// profitable or cannot be expressed with three sources.
bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
                                      SDValue &Src2, SDValue &Tbl) const {
  SmallVector<SDValue, 3> Src;
  uint8_t TTbl;
  unsigned NumOpcodes;

  std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);

  // Src.empty() case can happen if all operands are all zero or all ones.
  // Normally it shall be optimized out before reaching this.
  if (NumOpcodes < 2 || Src.empty())
    return false;

  // For a uniform case threshold should be higher to account for moves between
  // VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be in SGPRs
  // and a readfirstlane after.
  if (NumOpcodes < 4 && !In->isDivergent())
    return false;

  if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
    // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
    // asm more readable. This cannot be modeled with AddedComplexity because
    // selector does not know how many operations did we match.
    if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
        (In.getOperand(0).getOpcode() == In.getOpcode() ||
         In.getOperand(1).getOpcode() == In.getOpcode()))
      return false;

    if (In.getOpcode() == ISD::OR &&
        (In.getOperand(0).getOpcode() == ISD::AND ||
         In.getOperand(1).getOpcode() == ISD::AND))
      return false;
  }

  // Last operand can be ignored, turning a ternary operation into a binary.
  // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
  // 'c' with 'a' here without changing the answer. In some pathological
  // cases it should be possible to get an operation with a single operand
  // too if optimizer would not catch it.
  while (Src.size() < 3)
    Src.push_back(Src[0]);

  Src0 = Src[0];
  Src1 = Src[1];
  Src2 = Src[2];

  Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
  return true;
}
3890
getHi16Elt(SDValue In) const3891 SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
3892 if (In.isUndef())
3893 return CurDAG->getUNDEF(MVT::i32);
3894
3895 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
3896 SDLoc SL(In);
3897 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
3898 }
3899
3900 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
3901 SDLoc SL(In);
3902 return CurDAG->getConstant(
3903 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
3904 }
3905
3906 SDValue Src;
3907 if (isExtractHiElt(In, Src))
3908 return Src;
3909
3910 return SDValue();
3911 }
3912
// Decide whether an immediate should be materialized into a VGPR rather than
// an SGPR: returns true when at least one inspected use strictly requires a
// VGPR operand (even after attempting to commute that use) and no use
// requires an SGPR. Gives up (returns false) once 10 uses have been scanned.
bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
  assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());

  const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *SII = Subtarget->getInstrInfo();

  unsigned Limit = 0;
  bool AllUsesAcceptSReg = true;
  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
    Limit < 10 && U != E; ++U, ++Limit) {
    const TargetRegisterClass *RC =
        getOperandRegClass(U->getUser(), U->getOperandNo());

    // If the register class is unknown, it could be an unknown
    // register class that needs to be an SGPR, e.g. an inline asm
    // constraint
    if (!RC || SIRI->isSGPRClass(RC))
      return false;

    // VS_32/VS_64 accept either bank; anything else is VGPR-only for now.
    if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
      AllUsesAcceptSReg = false;
      SDNode *User = U->getUser();
      if (User->isMachineOpcode()) {
        unsigned Opc = User->getMachineOpcode();
        const MCInstrDesc &Desc = SII->get(Opc);
        if (Desc.isCommutable()) {
          // See if commuting this operand into the other slot would let it
          // live in an SGPR-compatible (VS_32/VS_64) operand instead.
          unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
          if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
            const TargetRegisterClass *CommutedRC =
                getOperandRegClass(U->getUser(), CommutedOpNo);
            if (CommutedRC == &AMDGPU::VS_32RegClass ||
                CommutedRC == &AMDGPU::VS_64RegClass)
              AllUsesAcceptSReg = true;
          }
        }
      }
      // If "AllUsesAcceptSReg == false" so far we haven't succeeded
      // commuting current user. This means have at least one use
      // that strictly require VGPR. Thus, we will not attempt to commute
      // other user instructions.
      if (!AllUsesAcceptSReg)
        break;
    }
  }
  // Limit == 10 means we hit the scan cap: answer conservatively false.
  return !AllUsesAcceptSReg && (Limit < 10);
}
3961
// Return true if this load may be selected as a scalar (uniform) load: it
// must be non-divergent (or provably uniform from its memory operand), have a
// known size, be aligned to at least min(size, 4) bytes, and read from the
// constant address space -- or from the global address space when the
// subtarget scalarizes global loads, the load is simple, and the memory is
// known not to be clobbered.
bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
  const auto *Ld = cast<LoadSDNode>(N);

  const MachineMemOperand *MMO = Ld->getMemOperand();
  if (N->isDivergent() && !AMDGPU::isUniformMMO(MMO))
    return false;

  return MMO->getSize().hasValue() &&
         Ld->getAlign() >=
             Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
                            uint64_t(4))) &&
         ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
           Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
          (Subtarget->getScalarizeGlobalBehavior() &&
           Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
           Ld->isSimple() &&
           static_cast<const SITargetLowering *>(getTargetLowering())
               ->isMemOpHasNoClobberedMemOperand(N)));
}
3981
// Run target post-selection folds over the already-selected DAG until a fixed
// point is reached (a successful fold may expose further opportunities).
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
  const AMDGPUTargetLowering& Lowering =
      *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
  bool IsModified = false;
  do {
    IsModified = false;

    // Go over all selected nodes and try to fold them a bit more
    SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
    while (Position != CurDAG->allnodes_end()) {
      // Advance before possibly replacing Node so the iterator stays valid.
      SDNode *Node = &*Position++;
      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
      // Only machine nodes (already selected) are candidates for folding.
      if (!MachineNode)
        continue;

      SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
      if (ResNode != Node) {
        if (ResNode)
          ReplaceUses(Node, ResNode);
        IsModified = true;
      }
    }
    // Drop nodes left dead by the replacements before the next pass.
    CurDAG->RemoveDeadNodes();
  } while (IsModified);
}
4007
// Legacy pass-manager wrapper: owns an AMDGPUDAGToDAGISel instance and runs
// it through the SelectionDAGISelLegacy machinery.
AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
                                                   CodeGenOptLevel OptLevel)
    : SelectionDAGISelLegacy(
          ID, std::make_unique<AMDGPUDAGToDAGISel>(TM, OptLevel)) {}

// Pass identification token used by the legacy pass manager.
char AMDGPUDAGToDAGISelLegacy::ID = 0;
4014