1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Custom DAG lowering for SI
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "SIISelLowering.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUInstrInfo.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "SIRegisterInfo.h"
22 #include "llvm/ADT/APInt.h"
23 #include "llvm/ADT/FloatingPointMode.h"
24 #include "llvm/ADT/Statistic.h"
25 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
26 #include "llvm/Analysis/UniformityAnalysis.h"
27 #include "llvm/CodeGen/Analysis.h"
28 #include "llvm/CodeGen/ByteProvider.h"
29 #include "llvm/CodeGen/FunctionLoweringInfo.h"
30 #include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
31 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
32 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
33 #include "llvm/CodeGen/MachineFrameInfo.h"
34 #include "llvm/CodeGen/MachineFunction.h"
35 #include "llvm/CodeGen/MachineLoopInfo.h"
36 #include "llvm/IR/DiagnosticInfo.h"
37 #include "llvm/IR/IRBuilder.h"
38 #include "llvm/IR/IntrinsicInst.h"
39 #include "llvm/IR/IntrinsicsAMDGPU.h"
40 #include "llvm/IR/IntrinsicsR600.h"
41 #include "llvm/IR/MDBuilder.h"
42 #include "llvm/Support/CommandLine.h"
43 #include "llvm/Support/KnownBits.h"
44 #include "llvm/Support/ModRef.h"
45 #include "llvm/Transforms/Utils/LowerAtomic.h"
46 #include <optional>
47
48 using namespace llvm;
49
50 #define DEBUG_TYPE "si-lower"
51
52 STATISTIC(NumTailCalls, "Number of tail calls");
53
54 static cl::opt<bool>
55 DisableLoopAlignment("amdgpu-disable-loop-alignment",
56 cl::desc("Do not align and prefetch loops"),
57 cl::init(false));
58
59 static cl::opt<bool> UseDivergentRegisterIndexing(
60 "amdgpu-use-divergent-register-indexing", cl::Hidden,
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
64 // TODO: This option should be removed once we switch to always using PTRADD in
65 // the SelectionDAG.
66 static cl::opt<bool> UseSelectionDAGPTRADD(
67 "amdgpu-use-sdag-ptradd", cl::Hidden,
68 cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
69 "SelectionDAG ISel"),
70 cl::init(false));
71
denormalModeIsFlushAllF32(const MachineFunction & MF)72 static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
73 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
74 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
75 }
76
denormalModeIsFlushAllF64F16(const MachineFunction & MF)77 static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
78 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
79 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
80 }
81
findFirstFreeSGPR(CCState & CCInfo)82 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
83 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
84 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
85 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
86 return AMDGPU::SGPR0 + Reg;
87 }
88 }
89 llvm_unreachable("Cannot allocate sgpr");
90 }
91
SITargetLowering(const TargetMachine & TM,const GCNSubtarget & STI)92 SITargetLowering::SITargetLowering(const TargetMachine &TM,
93 const GCNSubtarget &STI)
94 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
95 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
96 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
97
98 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
99 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
100
101 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
102
103 const SIRegisterInfo *TRI = STI.getRegisterInfo();
104 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
105
106 addRegisterClass(MVT::f64, V64RegClass);
107 addRegisterClass(MVT::v2f32, V64RegClass);
108 addRegisterClass(MVT::Untyped, V64RegClass);
109
110 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
111 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
112
113 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
114 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
115
116 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
117 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
118
119 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
120 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
121
122 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
123 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
124
125 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
126 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
127
128 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
129 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
130
131 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
132 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
133
134 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
135 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
136
137 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
138 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
139
140 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
141 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
142
143 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
144 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
145
146 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
147 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
148
149 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
150 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
151
152 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
153 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
154
155 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
156 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
157
158 if (Subtarget->has16BitInsts()) {
159 if (Subtarget->useRealTrue16Insts()) {
160 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
161 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
162 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
163 } else {
164 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
166 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
167 }
168
169 // Unless there are also VOP3P operations, no operations are really legal.
170 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
171 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
172 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
173 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
174 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
175 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
176 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
177 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
178 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
179 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
180 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
181 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
182 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
183 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
184 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
185 }
186
187 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
188 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
189
190 computeRegisterProperties(Subtarget->getRegisterInfo());
191
192 // The boolean content concept here is too inflexible. Compares only ever
193 // really produce a 1-bit result. Any copy/extend from these will turn into a
194 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
195 // it's what most targets use.
196 setBooleanContents(ZeroOrOneBooleanContent);
197 setBooleanVectorContents(ZeroOrOneBooleanContent);
198
199 // We need to custom lower vector stores from local memory
200 setOperationAction(ISD::LOAD,
201 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
202 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
203 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
204 MVT::i1, MVT::v32i32},
205 Custom);
206
207 setOperationAction(ISD::STORE,
208 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
209 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
210 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
211 MVT::i1, MVT::v32i32},
212 Custom);
213
214 if (isTypeLegal(MVT::bf16)) {
215 for (unsigned Opc :
216 {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
217 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
218 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
219 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
220 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
221 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
222 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
223 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
224 ISD::SETCC}) {
225 // FIXME: The promoted to type shouldn't need to be explicit
226 setOperationAction(Opc, MVT::bf16, Promote);
227 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
228 }
229
230 setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand);
231
232 setOperationAction(ISD::SELECT, MVT::bf16, Promote);
233 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
234
235 setOperationAction(ISD::FABS, MVT::bf16, Legal);
236 setOperationAction(ISD::FNEG, MVT::bf16, Legal);
237 setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Legal);
238
239 // We only need to custom lower because we can't specify an action for bf16
240 // sources.
241 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
242 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
243 }
244
245 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
246 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
247 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
248 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
249 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
250 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
251 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
252 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
253 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
254 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
255 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
256 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
257 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
258 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
259 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
260 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
261
262 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
263 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
264 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
265 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
266 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
267 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
268 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
269
270 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
271
272 setOperationAction(ISD::SELECT, MVT::i1, Promote);
273 setOperationAction(ISD::SELECT, MVT::i64, Custom);
274 setOperationAction(ISD::SELECT, MVT::f64, Promote);
275 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
276
277 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
278
279 setOperationAction(ISD::SELECT_CC,
280 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
281
282 setOperationAction(ISD::SETCC, MVT::i1, Promote);
283 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
284 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
285
286 setOperationAction(ISD::TRUNCATE,
287 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
288 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
289 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
290 Expand);
291 setOperationAction(ISD::FP_ROUND,
292 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
293 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
294 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
295 Expand);
296
297 setOperationAction(ISD::SIGN_EXTEND_INREG,
298 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
299 MVT::v3i16, MVT::v4i16, MVT::Other},
300 Custom);
301
302 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
303 setOperationAction(ISD::BR_CC,
304 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
305
306 setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);
307
308 setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal);
309
310 setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64,
311 Expand);
312
313 #if 0
314 setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal);
315 #endif
316
317 // We only support LOAD/STORE and vector manipulation ops for vectors
318 // with > 4 elements.
319 for (MVT VT :
320 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
321 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
322 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
323 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
324 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
325 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
326 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
327 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
328 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
329 switch (Op) {
330 case ISD::LOAD:
331 case ISD::STORE:
332 case ISD::BUILD_VECTOR:
333 case ISD::BITCAST:
334 case ISD::UNDEF:
335 case ISD::EXTRACT_VECTOR_ELT:
336 case ISD::INSERT_VECTOR_ELT:
337 case ISD::SCALAR_TO_VECTOR:
338 case ISD::IS_FPCLASS:
339 break;
340 case ISD::EXTRACT_SUBVECTOR:
341 case ISD::INSERT_SUBVECTOR:
342 case ISD::CONCAT_VECTORS:
343 setOperationAction(Op, VT, Custom);
344 break;
345 default:
346 setOperationAction(Op, VT, Expand);
347 break;
348 }
349 }
350 }
351
352 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
353
354 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
355 // is expanded to avoid having two separate loops in case the index is a VGPR.
356
357 // Most operations are naturally 32-bit vector operations. We only support
358 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
359 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
360 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
361 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
362
363 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
364 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
365
366 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
367 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
368
369 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
370 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
371 }
372
373 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
374 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
375 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
376
377 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
378 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
379
380 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
381 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
382
383 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
384 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
385 }
386
387 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
388 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
389 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
390
391 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
392 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
393
394 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
395 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
396
397 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
398 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
399 }
400
401 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
402 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
403 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
404
405 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
406 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
407
408 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
409 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
410
411 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
412 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
413 }
414
415 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
416 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
417 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
418
419 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
420 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
421
422 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
423 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
424
425 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
426 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
427 }
428
429 setOperationAction(ISD::VECTOR_SHUFFLE,
430 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
431 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
432 Custom);
433
434 if (Subtarget->hasPkMovB32()) {
435 // TODO: 16-bit element vectors should be legal with even aligned elements.
436 // TODO: Can be legal with wider source types than the result with
437 // subregister extracts.
438 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
439 }
440
441 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
442 Custom);
443
444 // Avoid stack access for these.
445 // TODO: Generalize to more vector types.
446 setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
447 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
448 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
449 Custom);
450
451 // Deal with vec3 vector operations when widened to vec4.
452 setOperationAction(ISD::INSERT_SUBVECTOR,
453 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
454
455 // Deal with vec5/6/7 vector operations when widened to vec8.
456 setOperationAction(ISD::INSERT_SUBVECTOR,
457 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
458 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
459 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
460 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
461 Custom);
462
463 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
464 // and output demarshalling
465 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
466
467 // We can't return success/failure, only the old value,
468 // let LLVM add the comparison
469 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
470 Expand);
471
472 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
473
474 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
475
476 // FIXME: This should be narrowed to i32, but that only happens if i64 is
477 // illegal.
478 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
479 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
480
481 // This is s_memtime on SI and s_memrealtime on VI.
482 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
483
484 if (Subtarget->hasSMemRealTime() ||
485 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
486 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
487 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
488
489 if (Subtarget->has16BitInsts()) {
490 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
491 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
492 } else {
493 setOperationAction(ISD::FSQRT, MVT::f16, Custom);
494 }
495
496 if (Subtarget->hasMadMacF32Insts())
497 setOperationAction(ISD::FMAD, MVT::f32, Legal);
498
499 if (!Subtarget->hasBFI())
500 // fcopysign can be done in a single instruction with BFI.
501 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
502
503 if (!Subtarget->hasBCNT(32))
504 setOperationAction(ISD::CTPOP, MVT::i32, Expand);
505
506 if (!Subtarget->hasBCNT(64))
507 setOperationAction(ISD::CTPOP, MVT::i64, Expand);
508
509 if (Subtarget->hasFFBH())
510 setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);
511
512 if (Subtarget->hasFFBL())
513 setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
514
515 // We only really have 32-bit BFE instructions (and 16-bit on VI).
516 //
517 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
518 // effort to match them now. We want this to be false for i64 cases when the
519 // extraction isn't restricted to the upper or lower half. Ideally we would
520 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
521 // span the midpoint are probably relatively rare, so don't worry about them
522 // for now.
523 if (Subtarget->hasBFE())
524 setHasExtractBitsInsn(true);
525
526 // Clamp modifier on add/sub
527 if (Subtarget->hasIntClamp())
528 setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal);
529
530 if (Subtarget->hasAddNoCarry())
531 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
532 Legal);
533
534 setOperationAction(
535 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
536 {MVT::f32, MVT::f64}, Custom);
537
538 // These are really only legal for ieee_mode functions. We should be avoiding
539 // them for functions that don't have ieee_mode enabled, so just say they are
540 // legal.
541 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
542 {MVT::f32, MVT::f64}, Legal);
543
544 if (Subtarget->haveRoundOpsF64())
545 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
546 Legal);
547 else
548 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
549 MVT::f64, Custom);
550
551 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
552 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
553 Legal);
554 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
555
556 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
557 setOperationAction(ISD::FDIV, MVT::f64, Custom);
558
559 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
560 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
561
562 // Custom lower these because we can't specify a rule based on an illegal
563 // source bf16.
564 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
565 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
566
567 if (Subtarget->has16BitInsts()) {
568 setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
569 ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
570 MVT::i16, Legal);
571
572 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
573
574 setOperationAction({ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
575 MVT::i16, Expand);
576
577 setOperationAction({ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
578 ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
579 ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
580 ISD::CTPOP},
581 MVT::i16, Promote);
582
583 setOperationAction(ISD::LOAD, MVT::i16, Custom);
584
585 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
586
587 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
588 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
589 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
590 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
591
592 setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom);
593 setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
594 setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i1, Custom);
595
596 setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i32, Custom);
597
598 // F16 - Constant Actions.
599 setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
600 setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
601
602 // F16 - Load/Store Actions.
603 setOperationAction(ISD::LOAD, MVT::f16, Promote);
604 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
605 setOperationAction(ISD::STORE, MVT::f16, Promote);
606 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
607
608 // BF16 - Load/Store Actions.
609 setOperationAction(ISD::LOAD, MVT::bf16, Promote);
610 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
611 setOperationAction(ISD::STORE, MVT::bf16, Promote);
612 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
613
614 // F16 - VOP1 Actions.
615 setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
616 ISD::FSIN, ISD::FROUND},
617 MVT::f16, Custom);
618
619 setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
620 setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);
621
622 // F16 - VOP2 Actions.
623 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
624 Expand);
625 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
626 setOperationAction(ISD::FFREXP, MVT::f16, Custom);
627 setOperationAction(ISD::FDIV, MVT::f16, Custom);
628
629 // F16 - VOP3 Actions.
630 setOperationAction(ISD::FMA, MVT::f16, Legal);
631 if (STI.hasMadF16())
632 setOperationAction(ISD::FMAD, MVT::f16, Legal);
633
634 for (MVT VT :
635 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
636 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
637 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
638 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
639 switch (Op) {
640 case ISD::LOAD:
641 case ISD::STORE:
642 case ISD::BUILD_VECTOR:
643 case ISD::BITCAST:
644 case ISD::UNDEF:
645 case ISD::EXTRACT_VECTOR_ELT:
646 case ISD::INSERT_VECTOR_ELT:
647 case ISD::INSERT_SUBVECTOR:
648 case ISD::SCALAR_TO_VECTOR:
649 case ISD::IS_FPCLASS:
650 break;
651 case ISD::EXTRACT_SUBVECTOR:
652 case ISD::CONCAT_VECTORS:
653 setOperationAction(Op, VT, Custom);
654 break;
655 default:
656 setOperationAction(Op, VT, Expand);
657 break;
658 }
659 }
660 }
661
662 // v_perm_b32 can handle either of these.
663 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
664 setOperationAction(ISD::BSWAP, MVT::v4i16, Custom);
665
666 // XXX - Do these do anything? Vector constants turn into build_vector.
667 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
668
669 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
670 Legal);
671
672 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
673 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
674 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
675 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
676
677 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
678 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
679 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
680 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
681
682 setOperationAction(ISD::AND, MVT::v2i16, Promote);
683 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
684 setOperationAction(ISD::OR, MVT::v2i16, Promote);
685 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
686 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
687 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
688
689 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
690 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
691 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
692 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
693 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
694 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
695
696 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
697 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
698 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
699 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
700 setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
701 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
702
703 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
704 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
705 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
706 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
707 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
708 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
709
710 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
711 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
712 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
713 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
714
715 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
716 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
717 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
718 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
719 setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
720 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
721
722 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
723 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
724 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
725 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
726 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
727 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
728
729 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
730 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
731 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
732 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
733 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
734 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
735
736 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
737 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
738 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
739 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
740 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
741 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
742
743 setOperationAction(ISD::STORE, MVT::v32i16, Promote);
744 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
745 setOperationAction(ISD::STORE, MVT::v32f16, Promote);
746 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
747 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
748 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
749
750 setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
751 MVT::v2i32, Expand);
752 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
753
754 setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
755 MVT::v4i32, Expand);
756
757 setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
758 MVT::v8i32, Expand);
759
760 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
761 Subtarget->hasVOP3PInsts() ? Legal : Custom);
762
763 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
764 // This isn't really legal, but this avoids the legalizer unrolling it (and
765 // allows matching fneg (fabs x) patterns)
766 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
767
768 // Can do this in one BFI plus a constant materialize.
769 setOperationAction(ISD::FCOPYSIGN,
770 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
771 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
772 MVT::v32f16, MVT::v32bf16},
773 Custom);
774
775 setOperationAction(
776 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
777 MVT::f16, Custom);
778 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
779
780 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
781 ISD::FMAXIMUMNUM},
782 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
783 Custom);
784
785 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
786 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
787 Expand);
788
789 for (MVT Vec16 :
790 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
791 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
792 setOperationAction(
793 {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
794 Vec16, Custom);
795 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);
796 }
797 }
798
799 if (Subtarget->hasVOP3PInsts()) {
800 setOperationAction({ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
801 ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
802 ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
803 MVT::v2i16, Legal);
804
805 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
806 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
807 MVT::v2f16, Legal);
808
809 setOperationAction(ISD::EXTRACT_VECTOR_ELT,
810 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
811
812 setOperationAction(ISD::VECTOR_SHUFFLE,
813 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
814 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
815 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
816 Custom);
817
818 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
819 // Split vector operations.
820 setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
821 ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN,
822 ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
823 ISD::SSUBSAT},
824 VT, Custom);
825
826 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
827 // Split vector operations.
828 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
829 VT, Custom);
830
831 setOperationAction(
832 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
833 {MVT::v2f16, MVT::v4f16}, Custom);
834
835 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
836 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
837 Custom);
838
839 if (Subtarget->hasPackedFP32Ops()) {
840 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
841 MVT::v2f32, Legal);
842 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA},
843 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
844 Custom);
845 }
846 }
847
848 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
849
850 if (Subtarget->has16BitInsts()) {
851 setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
852 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
853 setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
854 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
855 } else {
856 // Legalization hack.
857 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
858
859 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
860 }
861
862 setOperationAction(ISD::SELECT,
863 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
864 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
865 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
866 MVT::v32f16, MVT::v32bf16},
867 Custom);
868
869 setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
870
871 if (Subtarget->hasScalarSMulU64())
872 setOperationAction(ISD::MUL, MVT::i64, Custom);
873
874 if (Subtarget->hasMad64_32())
875 setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
876
877 if (Subtarget->hasPrefetch() && Subtarget->hasSafeSmemPrefetch())
878 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
879
880 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
881 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
882 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
883 } else {
884 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
885 if (Subtarget->hasMinimum3Maximum3F32())
886 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
887
888 if (Subtarget->hasMinimum3Maximum3PKF16()) {
889 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
890
891 // If only the vector form is available, we need to widen to a vector.
892 if (!Subtarget->hasMinimum3Maximum3F16())
893 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
894 }
895 }
896
897 if (Subtarget->hasVOP3PInsts()) {
898 // We want to break these into v2f16 pieces, not scalarize.
899 setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
900 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
901 Custom);
902 }
903
904 setOperationAction(ISD::INTRINSIC_WO_CHAIN,
905 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
906 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
907 MVT::i8},
908 Custom);
909
910 setOperationAction(ISD::INTRINSIC_W_CHAIN,
911 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
912 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
913 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
914 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
915 Custom);
916
917 setOperationAction(ISD::INTRINSIC_VOID,
918 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
919 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
920 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
921 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
922 Custom);
923
924 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
925 setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
926 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
927 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
928 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
929
930 // TODO: Could move this to custom lowering, could benefit from combines on
931 // extract of relevant bits.
932 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
933
934 setOperationAction(ISD::MUL, MVT::i1, Promote);
935
936 if (Subtarget->hasBF16ConversionInsts()) {
937 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
938 setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
939 }
940
941 if (Subtarget->hasCvtPkF16F32Inst()) {
942 setOperationAction(ISD::FP_ROUND,
943 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
944 Custom);
945 }
946
947 setTargetDAGCombine({ISD::ADD,
948 ISD::PTRADD,
949 ISD::UADDO_CARRY,
950 ISD::SUB,
951 ISD::USUBO_CARRY,
952 ISD::MUL,
953 ISD::FADD,
954 ISD::FSUB,
955 ISD::FDIV,
956 ISD::FMUL,
957 ISD::FMINNUM,
958 ISD::FMAXNUM,
959 ISD::FMINNUM_IEEE,
960 ISD::FMAXNUM_IEEE,
961 ISD::FMINIMUM,
962 ISD::FMAXIMUM,
963 ISD::FMINIMUMNUM,
964 ISD::FMAXIMUMNUM,
965 ISD::FMA,
966 ISD::SMIN,
967 ISD::SMAX,
968 ISD::UMIN,
969 ISD::UMAX,
970 ISD::SETCC,
971 ISD::SELECT,
972 ISD::SMIN,
973 ISD::SMAX,
974 ISD::UMIN,
975 ISD::UMAX,
976 ISD::AND,
977 ISD::OR,
978 ISD::XOR,
979 ISD::SHL,
980 ISD::SRL,
981 ISD::SRA,
982 ISD::FSHR,
983 ISD::SINT_TO_FP,
984 ISD::UINT_TO_FP,
985 ISD::FCANONICALIZE,
986 ISD::SCALAR_TO_VECTOR,
987 ISD::ZERO_EXTEND,
988 ISD::SIGN_EXTEND_INREG,
989 ISD::EXTRACT_VECTOR_ELT,
990 ISD::INSERT_VECTOR_ELT,
991 ISD::FCOPYSIGN});
992
993 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
994 setTargetDAGCombine(ISD::FP_ROUND);
995
996 // All memory operations. Some folding on the pointer operand is done to help
997 // matching the constant offsets in the addressing modes.
998 setTargetDAGCombine({ISD::LOAD,
999 ISD::STORE,
1000 ISD::ATOMIC_LOAD,
1001 ISD::ATOMIC_STORE,
1002 ISD::ATOMIC_CMP_SWAP,
1003 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1004 ISD::ATOMIC_SWAP,
1005 ISD::ATOMIC_LOAD_ADD,
1006 ISD::ATOMIC_LOAD_SUB,
1007 ISD::ATOMIC_LOAD_AND,
1008 ISD::ATOMIC_LOAD_OR,
1009 ISD::ATOMIC_LOAD_XOR,
1010 ISD::ATOMIC_LOAD_NAND,
1011 ISD::ATOMIC_LOAD_MIN,
1012 ISD::ATOMIC_LOAD_MAX,
1013 ISD::ATOMIC_LOAD_UMIN,
1014 ISD::ATOMIC_LOAD_UMAX,
1015 ISD::ATOMIC_LOAD_FADD,
1016 ISD::ATOMIC_LOAD_FMIN,
1017 ISD::ATOMIC_LOAD_FMAX,
1018 ISD::ATOMIC_LOAD_UINC_WRAP,
1019 ISD::ATOMIC_LOAD_UDEC_WRAP,
1020 ISD::INTRINSIC_VOID,
1021 ISD::INTRINSIC_W_CHAIN});
1022
1023 // FIXME: In other contexts we pretend this is a per-function property.
1024 setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
1025
1026 setSchedulingPreference(Sched::RegPressure);
1027 }
1028
getSubtarget() const1029 const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1030
getRoundingControlRegisters() const1031 ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
1032 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1033 return RCRegs;
1034 }
1035
1036 //===----------------------------------------------------------------------===//
1037 // TargetLowering queries
1038 //===----------------------------------------------------------------------===//
1039
1040 // v_mad_mix* support a conversion from f16 to f32.
1041 //
1042 // There is only one special case when denormals are enabled we don't currently,
1043 // where this is OK to use.
isFPExtFoldable(const SelectionDAG & DAG,unsigned Opcode,EVT DestVT,EVT SrcVT) const1044 bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1045 EVT DestVT, EVT SrcVT) const {
1046 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1047 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1048 DestVT.getScalarType() == MVT::f32 &&
1049 SrcVT.getScalarType() == MVT::f16 &&
1050 // TODO: This probably only requires no input flushing?
1051 denormalModeIsFlushAllF32(DAG.getMachineFunction());
1052 }
1053
isFPExtFoldable(const MachineInstr & MI,unsigned Opcode,LLT DestTy,LLT SrcTy) const1054 bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
1055 LLT DestTy, LLT SrcTy) const {
1056 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1057 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1058 DestTy.getScalarSizeInBits() == 32 &&
1059 SrcTy.getScalarSizeInBits() == 16 &&
1060 // TODO: This probably only requires no input flushing?
1061 denormalModeIsFlushAllF32(*MI.getMF());
1062 }
1063
isShuffleMaskLegal(ArrayRef<int>,EVT) const1064 bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
1065 // SI has some legal vector types, but no legal vector operations. Say no
1066 // shuffles are legal in order to prefer scalarizing some vector operations.
1067 return false;
1068 }
1069
getRegisterTypeForCallingConv(LLVMContext & Context,CallingConv::ID CC,EVT VT) const1070 MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1071 CallingConv::ID CC,
1072 EVT VT) const {
1073 if (CC == CallingConv::AMDGPU_KERNEL)
1074 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1075
1076 if (VT.isVector()) {
1077 EVT ScalarVT = VT.getScalarType();
1078 unsigned Size = ScalarVT.getSizeInBits();
1079 if (Size == 16) {
1080 if (Subtarget->has16BitInsts()) {
1081 if (VT.isInteger())
1082 return MVT::v2i16;
1083 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1084 }
1085 return VT.isInteger() ? MVT::i32 : MVT::f32;
1086 }
1087
1088 if (Size < 16)
1089 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1090 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1091 }
1092
1093 if (VT.getSizeInBits() > 32)
1094 return MVT::i32;
1095
1096 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1097 }
1098
getNumRegistersForCallingConv(LLVMContext & Context,CallingConv::ID CC,EVT VT) const1099 unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1100 CallingConv::ID CC,
1101 EVT VT) const {
1102 if (CC == CallingConv::AMDGPU_KERNEL)
1103 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1104
1105 if (VT.isVector()) {
1106 unsigned NumElts = VT.getVectorNumElements();
1107 EVT ScalarVT = VT.getScalarType();
1108 unsigned Size = ScalarVT.getSizeInBits();
1109
1110 // FIXME: Should probably promote 8-bit vectors to i16.
1111 if (Size == 16 && Subtarget->has16BitInsts())
1112 return (NumElts + 1) / 2;
1113
1114 if (Size <= 32)
1115 return NumElts;
1116
1117 if (Size > 32)
1118 return NumElts * ((Size + 31) / 32);
1119 } else if (VT.getSizeInBits() > 32)
1120 return (VT.getSizeInBits() + 31) / 32;
1121
1122 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1123 }
1124
getVectorTypeBreakdownForCallingConv(LLVMContext & Context,CallingConv::ID CC,EVT VT,EVT & IntermediateVT,unsigned & NumIntermediates,MVT & RegisterVT) const1125 unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1126 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1127 unsigned &NumIntermediates, MVT &RegisterVT) const {
1128 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1129 unsigned NumElts = VT.getVectorNumElements();
1130 EVT ScalarVT = VT.getScalarType();
1131 unsigned Size = ScalarVT.getSizeInBits();
1132 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1133 // support, but unless we can properly handle 3-vectors, it will be still be
1134 // inconsistent.
1135 if (Size == 16 && Subtarget->has16BitInsts()) {
1136 if (ScalarVT == MVT::bf16) {
1137 RegisterVT = MVT::i32;
1138 IntermediateVT = MVT::v2bf16;
1139 } else {
1140 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1141 IntermediateVT = RegisterVT;
1142 }
1143 NumIntermediates = (NumElts + 1) / 2;
1144 return NumIntermediates;
1145 }
1146
1147 if (Size == 32) {
1148 RegisterVT = ScalarVT.getSimpleVT();
1149 IntermediateVT = RegisterVT;
1150 NumIntermediates = NumElts;
1151 return NumIntermediates;
1152 }
1153
1154 if (Size < 16 && Subtarget->has16BitInsts()) {
1155 // FIXME: Should probably form v2i16 pieces
1156 RegisterVT = MVT::i16;
1157 IntermediateVT = ScalarVT;
1158 NumIntermediates = NumElts;
1159 return NumIntermediates;
1160 }
1161
1162 if (Size != 16 && Size <= 32) {
1163 RegisterVT = MVT::i32;
1164 IntermediateVT = ScalarVT;
1165 NumIntermediates = NumElts;
1166 return NumIntermediates;
1167 }
1168
1169 if (Size > 32) {
1170 RegisterVT = MVT::i32;
1171 IntermediateVT = RegisterVT;
1172 NumIntermediates = NumElts * ((Size + 31) / 32);
1173 return NumIntermediates;
1174 }
1175 }
1176
1177 return TargetLowering::getVectorTypeBreakdownForCallingConv(
1178 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1179 }
1180
memVTFromLoadIntrData(const SITargetLowering & TLI,const DataLayout & DL,Type * Ty,unsigned MaxNumLanes)1181 static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1182 const DataLayout &DL, Type *Ty,
1183 unsigned MaxNumLanes) {
1184 assert(MaxNumLanes != 0);
1185
1186 LLVMContext &Ctx = Ty->getContext();
1187 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1188 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1189 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1190 NumElts);
1191 }
1192
1193 return TLI.getValueType(DL, Ty);
1194 }
1195
1196 // Peek through TFE struct returns to only use the data size.
memVTFromLoadIntrReturn(const SITargetLowering & TLI,const DataLayout & DL,Type * Ty,unsigned MaxNumLanes)1197 static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1198 const DataLayout &DL, Type *Ty,
1199 unsigned MaxNumLanes) {
1200 auto *ST = dyn_cast<StructType>(Ty);
1201 if (!ST)
1202 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1203
1204 // TFE intrinsics return an aggregate type.
1205 assert(ST->getNumContainedTypes() == 2 &&
1206 ST->getContainedType(1)->isIntegerTy(32));
1207 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1208 }
1209
1210 /// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1211 /// in-memory representation. This return value is a custom type because there
1212 /// is no MVT::i160 and adding one breaks integer promotion logic. While this
1213 /// could cause issues during codegen, these address space 7 pointers will be
1214 /// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1215 /// in order to allow pre-codegen passes that query TargetTransformInfo, often
1216 /// for cost modeling, to work. (This also sets us up decently for doing the
1217 /// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
getPointerTy(const DataLayout & DL,unsigned AS) const1218 MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1219 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1220 return MVT::amdgpuBufferFatPointer;
1221 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1222 DL.getPointerSizeInBits(AS) == 192)
1223 return MVT::amdgpuBufferStridedPointer;
1224 return AMDGPUTargetLowering::getPointerTy(DL, AS);
1225 }
1226 /// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1227 /// v8i32 when padding is added.
1228 /// The in-memory representation of a p9 is {p8, i32, i32}, which is
1229 /// also v8i32 with padding.
getPointerMemTy(const DataLayout & DL,unsigned AS) const1230 MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1231 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1232 DL.getPointerSizeInBits(AS) == 160) ||
1233 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1234 DL.getPointerSizeInBits(AS) == 192))
1235 return MVT::v8i32;
1236 return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1237 }
1238
getTgtMemIntrinsic(IntrinsicInfo & Info,const CallInst & CI,MachineFunction & MF,unsigned IntrID) const1239 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
1240 const CallInst &CI,
1241 MachineFunction &MF,
1242 unsigned IntrID) const {
1243 Info.flags = MachineMemOperand::MONone;
1244 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1245 Info.flags |= MachineMemOperand::MOInvariant;
1246 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1247 Info.flags |= MachineMemOperand::MONonTemporal;
1248 Info.flags |= getTargetMMOFlags(CI);
1249
1250 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1251 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
1252 AttributeSet Attr =
1253 Intrinsic::getFnAttributes(CI.getContext(), (Intrinsic::ID)IntrID);
1254 MemoryEffects ME = Attr.getMemoryEffects();
1255 if (ME.doesNotAccessMemory())
1256 return false;
1257
1258 // TODO: Should images get their own address space?
1259 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1260
1261 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1262 if (RsrcIntr->IsImage) {
1263 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1264 AMDGPU::getImageDimIntrinsicInfo(IntrID);
1265 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1266 Info.align.reset();
1267 }
1268
1269 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1270 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1271 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1272 // We conservatively set the memory operand of a buffer intrinsic to the
1273 // base resource pointer, so that we can access alias information about
1274 // those pointers. Cases like "this points at the same value
1275 // but with a different offset" are handled in
1276 // areMemAccessesTriviallyDisjoint.
1277 Info.ptrVal = RsrcArg;
1278 }
1279
1280 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1281 if (!IsSPrefetch) {
1282 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1283 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1284 Info.flags |= MachineMemOperand::MOVolatile;
1285 }
1286
1287 Info.flags |= MachineMemOperand::MODereferenceable;
1288 if (ME.onlyReadsMemory()) {
1289 if (RsrcIntr->IsImage) {
1290 unsigned MaxNumLanes = 4;
1291
1292 if (!BaseOpcode->Gather4) {
1293 // If this isn't a gather, we may have excess loaded elements in the
1294 // IR type. Check the dmask for the real number of elements loaded.
1295 unsigned DMask =
1296 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1297 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1298 }
1299
1300 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1301 CI.getType(), MaxNumLanes);
1302 } else {
1303 Info.memVT =
1304 memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
1305 std::numeric_limits<unsigned>::max());
1306 }
1307
1308 // FIXME: What does alignment mean for an image?
1309 Info.opc = ISD::INTRINSIC_W_CHAIN;
1310 Info.flags |= MachineMemOperand::MOLoad;
1311 } else if (ME.onlyWritesMemory()) {
1312 Info.opc = ISD::INTRINSIC_VOID;
1313
1314 Type *DataTy = CI.getArgOperand(0)->getType();
1315 if (RsrcIntr->IsImage) {
1316 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1317 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1318 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1319 DMaskLanes);
1320 } else
1321 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1322
1323 Info.flags |= MachineMemOperand::MOStore;
1324 } else {
1325 // Atomic, NoReturn Sampler or prefetch
1326 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1327 : ISD::INTRINSIC_W_CHAIN;
1328 Info.flags |=
1329 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
1330
1331 if (!IsSPrefetch)
1332 Info.flags |= MachineMemOperand::MOStore;
1333
1334 switch (IntrID) {
1335 default:
1336 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1337 // Fake memory access type for no return sampler intrinsics
1338 Info.memVT = MVT::i32;
1339 } else {
1340 // XXX - Should this be volatile without known ordering?
1341 Info.flags |= MachineMemOperand::MOVolatile;
1342 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1343 }
1344 break;
1345 case Intrinsic::amdgcn_raw_buffer_load_lds:
1346 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1347 case Intrinsic::amdgcn_struct_buffer_load_lds:
1348 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1349 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1350 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1351 Info.ptrVal = CI.getArgOperand(1);
1352 return true;
1353 }
1354 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1355 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1356 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1357 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1358 Info.memVT =
1359 memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
1360 std::numeric_limits<unsigned>::max());
1361 Info.flags &= ~MachineMemOperand::MOStore;
1362 return true;
1363 }
1364 }
1365 }
1366 return true;
1367 }
1368
1369 switch (IntrID) {
1370 case Intrinsic::amdgcn_ds_ordered_add:
1371 case Intrinsic::amdgcn_ds_ordered_swap: {
1372 Info.opc = ISD::INTRINSIC_W_CHAIN;
1373 Info.memVT = MVT::getVT(CI.getType());
1374 Info.ptrVal = CI.getOperand(0);
1375 Info.align.reset();
1376 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1377
1378 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1379 if (!Vol->isZero())
1380 Info.flags |= MachineMemOperand::MOVolatile;
1381
1382 return true;
1383 }
1384 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1385 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1386 Info.opc = ISD::INTRINSIC_W_CHAIN;
1387 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1388 Info.ptrVal = nullptr;
1389 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1390 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1391 return true;
1392 }
1393 case Intrinsic::amdgcn_ds_append:
1394 case Intrinsic::amdgcn_ds_consume: {
1395 Info.opc = ISD::INTRINSIC_W_CHAIN;
1396 Info.memVT = MVT::getVT(CI.getType());
1397 Info.ptrVal = CI.getOperand(0);
1398 Info.align.reset();
1399 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1400
1401 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1402 if (!Vol->isZero())
1403 Info.flags |= MachineMemOperand::MOVolatile;
1404
1405 return true;
1406 }
1407 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1408 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1409 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1410 ? ISD::INTRINSIC_W_CHAIN
1411 : ISD::INTRINSIC_VOID;
1412 Info.memVT = MVT::getVT(CI.getType());
1413 Info.ptrVal = CI.getOperand(0);
1414 Info.memVT = MVT::i64;
1415 Info.size = 8;
1416 Info.align.reset();
1417 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1418 return true;
1419 }
1420 case Intrinsic::amdgcn_global_atomic_csub: {
1421 Info.opc = ISD::INTRINSIC_W_CHAIN;
1422 Info.memVT = MVT::getVT(CI.getType());
1423 Info.ptrVal = CI.getOperand(0);
1424 Info.align.reset();
1425 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
1426 MachineMemOperand::MOVolatile;
1427 return true;
1428 }
1429 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1430 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1431 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1432 Info.opc = ISD::INTRINSIC_W_CHAIN;
1433 Info.memVT =
1434 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1435 ? CI.getType()
1436 : cast<StructType>(CI.getType())
1437 ->getElementType(0)); // XXX: what is correct VT?
1438
1439 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1440 Info.align.reset();
1441 Info.flags |=
1442 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
1443 return true;
1444 }
1445 case Intrinsic::amdgcn_global_atomic_fmin_num:
1446 case Intrinsic::amdgcn_global_atomic_fmax_num:
1447 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1448 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1449 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1450 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1451 Info.opc = ISD::INTRINSIC_W_CHAIN;
1452 Info.memVT = MVT::getVT(CI.getType());
1453 Info.ptrVal = CI.getOperand(0);
1454 Info.align.reset();
1455 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
1456 MachineMemOperand::MODereferenceable |
1457 MachineMemOperand::MOVolatile;
1458 return true;
1459 }
1460 case Intrinsic::amdgcn_ds_load_tr6_b96:
1461 case Intrinsic::amdgcn_ds_load_tr4_b64:
1462 case Intrinsic::amdgcn_ds_load_tr8_b64:
1463 case Intrinsic::amdgcn_ds_load_tr16_b128:
1464 case Intrinsic::amdgcn_global_load_tr6_b96:
1465 case Intrinsic::amdgcn_global_load_tr4_b64:
1466 case Intrinsic::amdgcn_global_load_tr_b64:
1467 case Intrinsic::amdgcn_global_load_tr_b128:
1468 case Intrinsic::amdgcn_ds_read_tr4_b64:
1469 case Intrinsic::amdgcn_ds_read_tr6_b96:
1470 case Intrinsic::amdgcn_ds_read_tr8_b64:
1471 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1472 Info.opc = ISD::INTRINSIC_W_CHAIN;
1473 Info.memVT = MVT::getVT(CI.getType());
1474 Info.ptrVal = CI.getOperand(0);
1475 Info.align.reset();
1476 Info.flags |= MachineMemOperand::MOLoad;
1477 return true;
1478 }
1479 case Intrinsic::amdgcn_ds_gws_init:
1480 case Intrinsic::amdgcn_ds_gws_barrier:
1481 case Intrinsic::amdgcn_ds_gws_sema_v:
1482 case Intrinsic::amdgcn_ds_gws_sema_br:
1483 case Intrinsic::amdgcn_ds_gws_sema_p:
1484 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1485 Info.opc = ISD::INTRINSIC_VOID;
1486
1487 const GCNTargetMachine &TM =
1488 static_cast<const GCNTargetMachine &>(getTargetMachine());
1489
1490 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1491 Info.ptrVal = MFI->getGWSPSV(TM);
1492
1493 // This is an abstract access, but we need to specify a type and size.
1494 Info.memVT = MVT::i32;
1495 Info.size = 4;
1496 Info.align = Align(4);
1497
1498 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1499 Info.flags |= MachineMemOperand::MOLoad;
1500 else
1501 Info.flags |= MachineMemOperand::MOStore;
1502 return true;
1503 }
1504 case Intrinsic::amdgcn_load_to_lds:
1505 case Intrinsic::amdgcn_global_load_lds: {
1506 Info.opc = ISD::INTRINSIC_VOID;
1507 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1508 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1509 Info.ptrVal = CI.getArgOperand(1);
1510 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1511 return true;
1512 }
1513 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1514 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1515 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1516 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1517 Info.opc = ISD::INTRINSIC_W_CHAIN;
1518
1519 const GCNTargetMachine &TM =
1520 static_cast<const GCNTargetMachine &>(getTargetMachine());
1521
1522 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1523 Info.ptrVal = MFI->getGWSPSV(TM);
1524
1525 // This is an abstract access, but we need to specify a type and size.
1526 Info.memVT = MVT::i32;
1527 Info.size = 4;
1528 Info.align = Align(4);
1529
1530 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1531 return true;
1532 }
1533 case Intrinsic::amdgcn_s_prefetch_data: {
1534 Info.opc = ISD::INTRINSIC_VOID;
1535 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1536 Info.ptrVal = CI.getArgOperand(0);
1537 Info.flags |= MachineMemOperand::MOLoad;
1538 return true;
1539 }
1540 default:
1541 return false;
1542 }
1543 }
1544
CollectTargetIntrinsicOperands(const CallInst & I,SmallVectorImpl<SDValue> & Ops,SelectionDAG & DAG) const1545 void SITargetLowering::CollectTargetIntrinsicOperands(
1546 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1547 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1548 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1549 // The DAG's ValueType loses the addrspaces.
1550 // Add them as 2 extra Constant operands "from" and "to".
1551 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1552 unsigned DstAS = I.getType()->getPointerAddressSpace();
1553 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1554 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1555 break;
1556 }
1557 default:
1558 break;
1559 }
1560 }
1561
getAddrModeArguments(const IntrinsicInst * II,SmallVectorImpl<Value * > & Ops,Type * & AccessTy) const1562 bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
1563 SmallVectorImpl<Value *> &Ops,
1564 Type *&AccessTy) const {
1565 Value *Ptr = nullptr;
1566 switch (II->getIntrinsicID()) {
1567 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1568 case Intrinsic::amdgcn_ds_append:
1569 case Intrinsic::amdgcn_ds_consume:
1570 case Intrinsic::amdgcn_ds_load_tr8_b64:
1571 case Intrinsic::amdgcn_ds_load_tr16_b128:
1572 case Intrinsic::amdgcn_ds_load_tr4_b64:
1573 case Intrinsic::amdgcn_ds_load_tr6_b96:
1574 case Intrinsic::amdgcn_ds_read_tr4_b64:
1575 case Intrinsic::amdgcn_ds_read_tr6_b96:
1576 case Intrinsic::amdgcn_ds_read_tr8_b64:
1577 case Intrinsic::amdgcn_ds_read_tr16_b64:
1578 case Intrinsic::amdgcn_ds_ordered_add:
1579 case Intrinsic::amdgcn_ds_ordered_swap:
1580 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1581 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1582 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1583 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1584 case Intrinsic::amdgcn_global_atomic_csub:
1585 case Intrinsic::amdgcn_global_atomic_fmax_num:
1586 case Intrinsic::amdgcn_global_atomic_fmin_num:
1587 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1588 case Intrinsic::amdgcn_global_load_tr_b64:
1589 case Intrinsic::amdgcn_global_load_tr_b128:
1590 case Intrinsic::amdgcn_global_load_tr4_b64:
1591 case Intrinsic::amdgcn_global_load_tr6_b96:
1592 Ptr = II->getArgOperand(0);
1593 break;
1594 case Intrinsic::amdgcn_load_to_lds:
1595 case Intrinsic::amdgcn_global_load_lds:
1596 Ptr = II->getArgOperand(1);
1597 break;
1598 default:
1599 return false;
1600 }
1601 AccessTy = II->getType();
1602 Ops.push_back(Ptr);
1603 return true;
1604 }
1605
isLegalFlatAddressingMode(const AddrMode & AM,unsigned AddrSpace) const1606 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1607 unsigned AddrSpace) const {
1608 if (!Subtarget->hasFlatInstOffsets()) {
1609 // Flat instructions do not have offsets, and only have the register
1610 // address.
1611 return AM.BaseOffs == 0 && AM.Scale == 0;
1612 }
1613
1614 decltype(SIInstrFlags::FLAT) FlatVariant =
1615 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ? SIInstrFlags::FlatGlobal
1616 : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch
1617 : SIInstrFlags::FLAT;
1618
1619 return AM.Scale == 0 &&
1620 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1621 AM.BaseOffs, AddrSpace, FlatVariant));
1622 }
1623
isLegalGlobalAddressingMode(const AddrMode & AM) const1624 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1625 if (Subtarget->hasFlatGlobalInsts())
1626 return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS);
1627
1628 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1629 // Assume the we will use FLAT for all global memory accesses
1630 // on VI.
1631 // FIXME: This assumption is currently wrong. On VI we still use
1632 // MUBUF instructions for the r + i addressing mode. As currently
1633 // implemented, the MUBUF instructions only work on buffer < 4GB.
1634 // It may be possible to support > 4GB buffers with MUBUF instructions,
1635 // by setting the stride value in the resource descriptor which would
1636 // increase the size limit to (stride * 4GB). However, this is risky,
1637 // because it has never been validated.
1638 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
1639 }
1640
1641 return isLegalMUBUFAddressingMode(AM);
1642 }
1643
isLegalMUBUFAddressingMode(const AddrMode & AM) const1644 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1645 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1646 // additionally can do r + r + i with addr64. 32-bit has more addressing
1647 // mode options. Depending on the resource constant, it can also do
1648 // (i64 r0) + (i32 r1) * (i14 i).
1649 //
1650 // Private arrays end up using a scratch buffer most of the time, so also
1651 // assume those use MUBUF instructions. Scratch loads / stores are currently
1652 // implemented as mubuf instructions with offen bit set, so slightly
1653 // different than the normal addr64.
1654 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1655 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1656 return false;
1657
1658 // FIXME: Since we can split immediate into soffset and immediate offset,
1659 // would it make sense to allow any immediate?
1660
1661 switch (AM.Scale) {
1662 case 0: // r + i or just i, depending on HasBaseReg.
1663 return true;
1664 case 1:
1665 return true; // We have r + r or r + i.
1666 case 2:
1667 if (AM.HasBaseReg) {
1668 // Reject 2 * r + r.
1669 return false;
1670 }
1671
1672 // Allow 2 * r as r + r
1673 // Or 2 * r + i is allowed as r + r + i.
1674 return true;
1675 default: // Don't allow n * r
1676 return false;
1677 }
1678 }
1679
isLegalAddressingMode(const DataLayout & DL,const AddrMode & AM,Type * Ty,unsigned AS,Instruction * I) const1680 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1681 const AddrMode &AM, Type *Ty,
1682 unsigned AS,
1683 Instruction *I) const {
1684 // No global is ever allowed as a base.
1685 if (AM.BaseGV)
1686 return false;
1687
1688 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1689 return isLegalGlobalAddressingMode(AM);
1690
1691 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1692 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1693 AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
1694 AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
1695 // If the offset isn't a multiple of 4, it probably isn't going to be
1696 // correctly aligned.
1697 // FIXME: Can we get the real alignment here?
1698 if (AM.BaseOffs % 4 != 0)
1699 return isLegalMUBUFAddressingMode(AM);
1700
1701 if (!Subtarget->hasScalarSubwordLoads()) {
1702 // There are no SMRD extloads, so if we have to do a small type access we
1703 // will use a MUBUF load.
1704 // FIXME?: We also need to do this if unaligned, but we don't know the
1705 // alignment here.
1706 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1707 return isLegalGlobalAddressingMode(AM);
1708 }
1709
1710 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1711 // SMRD instructions have an 8-bit, dword offset on SI.
1712 if (!isUInt<8>(AM.BaseOffs / 4))
1713 return false;
1714 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1715 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1716 // in 8-bits, it can use a smaller encoding.
1717 if (!isUInt<32>(AM.BaseOffs / 4))
1718 return false;
1719 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1720 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1721 if (!isUInt<20>(AM.BaseOffs))
1722 return false;
1723 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1724 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1725 // for S_BUFFER_* instructions).
1726 if (!isInt<21>(AM.BaseOffs))
1727 return false;
1728 } else {
1729 // On GFX12, all offsets are signed 24-bit in bytes.
1730 if (!isInt<24>(AM.BaseOffs))
1731 return false;
1732 }
1733
1734 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1735 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1736 AM.BaseOffs < 0) {
1737 // Scalar (non-buffer) loads can only use a negative offset if
1738 // soffset+offset is non-negative. Since the compiler can only prove that
1739 // in a few special cases, it is safer to claim that negative offsets are
1740 // not supported.
1741 return false;
1742 }
1743
1744 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1745 return true;
1746
1747 if (AM.Scale == 1 && AM.HasBaseReg)
1748 return true;
1749
1750 return false;
1751 }
1752
1753 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1754 return Subtarget->enableFlatScratch()
1755 ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS)
1756 : isLegalMUBUFAddressingMode(AM);
1757
1758 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1759 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1760 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1761 // field.
1762 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1763 // an 8-bit dword offset but we don't know the alignment here.
1764 if (!isUInt<16>(AM.BaseOffs))
1765 return false;
1766
1767 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1768 return true;
1769
1770 if (AM.Scale == 1 && AM.HasBaseReg)
1771 return true;
1772
1773 return false;
1774 }
1775
1776 if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1777 // For an unknown address space, this usually means that this is for some
1778 // reason being used for pure arithmetic, and not based on some addressing
1779 // computation. We don't have instructions that compute pointers with any
1780 // addressing modes, so treat them as having no offset like flat
1781 // instructions.
1782 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
1783 }
1784
1785 // Assume a user alias of global for unknown address spaces.
1786 return isLegalGlobalAddressingMode(AM);
1787 }
1788
canMergeStoresTo(unsigned AS,EVT MemVT,const MachineFunction & MF) const1789 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1790 const MachineFunction &MF) const {
1791 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1792 return (MemVT.getSizeInBits() <= 4 * 32);
1793 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1794 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1795 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1796 }
1797 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
1798 return (MemVT.getSizeInBits() <= 2 * 32);
1799 return true;
1800 }
1801
allowsMisalignedMemoryAccessesImpl(unsigned Size,unsigned AddrSpace,Align Alignment,MachineMemOperand::Flags Flags,unsigned * IsFast) const1802 bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1803 unsigned Size, unsigned AddrSpace, Align Alignment,
1804 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1805 if (IsFast)
1806 *IsFast = 0;
1807
1808 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1809 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1810 // Check if alignment requirements for ds_read/write instructions are
1811 // disabled.
1812 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1813 return false;
1814
1815 Align RequiredAlignment(
1816 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1817 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1818 Alignment < RequiredAlignment)
1819 return false;
1820
1821 // Either, the alignment requirements are "enabled", or there is an
1822 // unaligned LDS access related hardware bug though alignment requirements
1823 // are "disabled". In either case, we need to check for proper alignment
1824 // requirements.
1825 //
1826 switch (Size) {
1827 case 64:
1828 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1829 // address is negative, then the instruction is incorrectly treated as
1830 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1831 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1832 // load later in the SILoadStoreOptimizer.
1833 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1834 return false;
1835
1836 // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
1837 // can do a 4 byte aligned, 8 byte access in a single operation using
1838 // ds_read2/write2_b32 with adjacent offsets.
1839 RequiredAlignment = Align(4);
1840
1841 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1842 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1843 // ds_write2_b32 depending on the alignment. In either case with either
1844 // alignment there is no faster way of doing this.
1845
1846 // The numbers returned here and below are not additive, it is a 'speed
1847 // rank'. They are just meant to be compared to decide if a certain way
1848 // of lowering an operation is faster than another. For that purpose
1849 // naturally aligned operation gets it bitsize to indicate that "it
1850 // operates with a speed comparable to N-bit wide load". With the full
1851 // alignment ds128 is slower than ds96 for example. If underaligned it
1852 // is comparable to a speed of a single dword access, which would then
1853 // mean 32 < 128 and it is faster to issue a wide load regardless.
1854 // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
1855 // wider load which will not be aligned anymore the latter is slower.
1856 if (IsFast)
1857 *IsFast = (Alignment >= RequiredAlignment) ? 64
1858 : (Alignment < Align(4)) ? 32
1859 : 1;
1860 return true;
1861 }
1862
1863 break;
1864 case 96:
1865 if (!Subtarget->hasDS96AndDS128())
1866 return false;
1867
1868 // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
1869 // gfx8 and older.
1870
1871 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1872 // Naturally aligned access is fastest. However, also report it is Fast
1873 // if memory is aligned less than DWORD. A narrow load or store will be
1874 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
1875 // be more of them, so overall we will pay less penalty issuing a single
1876 // instruction.
1877
1878 // See comment on the values above.
1879 if (IsFast)
1880 *IsFast = (Alignment >= RequiredAlignment) ? 96
1881 : (Alignment < Align(4)) ? 32
1882 : 1;
1883 return true;
1884 }
1885
1886 break;
1887 case 128:
1888 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1889 return false;
1890
1891 // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
1892 // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
1893 // single operation using ds_read2/write2_b64.
1894 RequiredAlignment = Align(8);
1895
1896 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1897 // Naturally aligned access is fastest. However, also report it is Fast
1898 // if memory is aligned less than DWORD. A narrow load or store will be
1899 // be equally slow as a single ds_read_b128/ds_write_b128, but there
1900 // will be more of them, so overall we will pay less penalty issuing a
1901 // single instruction.
1902
1903 // See comment on the values above.
1904 if (IsFast)
1905 *IsFast = (Alignment >= RequiredAlignment) ? 128
1906 : (Alignment < Align(4)) ? 32
1907 : 1;
1908 return true;
1909 }
1910
1911 break;
1912 default:
1913 if (Size > 32)
1914 return false;
1915
1916 break;
1917 }
1918
1919 // See comment on the values above.
1920 // Note that we have a single-dword or sub-dword here, so if underaligned
1921 // it is a slowest possible access, hence returned value is 0.
1922 if (IsFast)
1923 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1924
1925 return Alignment >= RequiredAlignment ||
1926 Subtarget->hasUnalignedDSAccessEnabled();
1927 }
1928
1929 // FIXME: We have to be conservative here and assume that flat operations
1930 // will access scratch. If we had access to the IR function, then we
1931 // could determine if any private memory was used in the function.
1932 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1933 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
1934 bool AlignedBy4 = Alignment >= Align(4);
1935 if (IsFast)
1936 *IsFast = AlignedBy4;
1937
1938 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
1939 }
1940
1941 // So long as they are correct, wide global memory operations perform better
1942 // than multiple smaller memory ops -- even when misaligned
1943 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1944 if (IsFast)
1945 *IsFast = Size;
1946
1947 return Alignment >= Align(4) ||
1948 Subtarget->hasUnalignedBufferAccessEnabled();
1949 }
1950
1951 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
1952 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
1953 // out-of-bounds behavior, but in the edge case where an access starts
1954 // out-of-bounds and then enter in-bounds, the entire access would be treated
1955 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
1956 // natural alignment of buffer accesses.
1957 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
1958 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
1959 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
1960 if (!Subtarget->hasRelaxedBufferOOBMode() &&
1961 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
1962 return false;
1963 }
1964
1965 // Smaller than dword value must be aligned.
1966 if (Size < 32)
1967 return false;
1968
1969 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1970 // byte-address are ignored, thus forcing Dword alignment.
1971 // This applies to private, global, and constant memory.
1972 if (IsFast)
1973 *IsFast = 1;
1974
1975 return Size >= 32 && Alignment >= Align(4);
1976 }
1977
allowsMisalignedMemoryAccesses(EVT VT,unsigned AddrSpace,Align Alignment,MachineMemOperand::Flags Flags,unsigned * IsFast) const1978 bool SITargetLowering::allowsMisalignedMemoryAccesses(
1979 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1980 unsigned *IsFast) const {
1981 return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
1982 Alignment, Flags, IsFast);
1983 }
1984
getOptimalMemOpType(LLVMContext & Context,const MemOp & Op,const AttributeList & FuncAttributes) const1985 EVT SITargetLowering::getOptimalMemOpType(
1986 LLVMContext &Context, const MemOp &Op,
1987 const AttributeList &FuncAttributes) const {
1988 // FIXME: Should account for address space here.
1989
1990 // The default fallback uses the private pointer size as a guess for a type to
1991 // use. Make sure we switch these to 64-bit accesses.
1992
1993 if (Op.size() >= 16 &&
1994 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1995 return MVT::v4i32;
1996
1997 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1998 return MVT::v2i32;
1999
2000 // Use the default.
2001 return MVT::Other;
2002 }
2003
isMemOpHasNoClobberedMemOperand(const SDNode * N) const2004 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
2005 const MemSDNode *MemNode = cast<MemSDNode>(N);
2006 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2007 }
2008
isNonGlobalAddrSpace(unsigned AS)2009 bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
2010 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
2011 AS == AMDGPUAS::PRIVATE_ADDRESS;
2012 }
2013
isFreeAddrSpaceCast(unsigned SrcAS,unsigned DestAS) const2014 bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
2015 unsigned DestAS) const {
2016 // Flat -> private/local is a simple truncate.
2017 // Flat -> global is no-op
2018 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2019 return true;
2020
2021 const GCNTargetMachine &TM =
2022 static_cast<const GCNTargetMachine &>(getTargetMachine());
2023 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2024 }
2025
2026 TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(MVT VT) const2027 SITargetLowering::getPreferredVectorAction(MVT VT) const {
2028 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2029 VT.getScalarType().bitsLE(MVT::i16))
2030 return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
2031 return TargetLoweringBase::getPreferredVectorAction(VT);
2032 }
2033
/// Unconditionally answers "yes": neither \p Imm nor \p Ty is inspected.
bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                         Type *Ty) const {
  // FIXME: Could be smarter if called for vector constants.
  return true;
}
2039
isExtractSubvectorCheap(EVT ResVT,EVT SrcVT,unsigned Index) const2040 bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
2041 unsigned Index) const {
2042 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
2043 return false;
2044
2045 // TODO: Add more cases that are cheap.
2046 return Index == 0;
2047 }
2048
isExtractVecEltCheap(EVT VT,unsigned Index) const2049 bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2050 // TODO: This should be more aggressive, particular for 16-bit element
2051 // vectors. However there are some mixed improvements and regressions.
2052 EVT EltTy = VT.getVectorElementType();
2053 return EltTy.getSizeInBits() % 32 == 0;
2054 }
2055
isTypeDesirableForOp(unsigned Op,EVT VT) const2056 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
2057 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2058 switch (Op) {
2059 case ISD::LOAD:
2060 case ISD::STORE:
2061 return true;
2062 default:
2063 return false;
2064 }
2065 }
2066
2067 // SimplifySetCC uses this function to determine whether or not it should
2068 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2069 if (VT == MVT::i1 && Op == ISD::SETCC)
2070 return false;
2071
2072 return TargetLowering::isTypeDesirableForOp(Op, VT);
2073 }
2074
lowerKernArgParameterPtr(SelectionDAG & DAG,const SDLoc & SL,SDValue Chain,uint64_t Offset) const2075 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2076 const SDLoc &SL,
2077 SDValue Chain,
2078 uint64_t Offset) const {
2079 const DataLayout &DL = DAG.getDataLayout();
2080 MachineFunction &MF = DAG.getMachineFunction();
2081 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2082 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
2083
2084 auto [InputPtrReg, RC, ArgTy] =
2085 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2086
2087 // We may not have the kernarg segment argument if we have no kernel
2088 // arguments.
2089 if (!InputPtrReg)
2090 return DAG.getConstant(Offset, SL, PtrVT);
2091
2092 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
2093 SDValue BasePtr = DAG.getCopyFromReg(
2094 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2095
2096 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2097 }
2098
getImplicitArgPtr(SelectionDAG & DAG,const SDLoc & SL) const2099 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2100 const SDLoc &SL) const {
2101 uint64_t Offset =
2102 getImplicitParameterOffset(DAG.getMachineFunction(), FIRST_IMPLICIT);
2103 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2104 }
2105
getLDSKernelId(SelectionDAG & DAG,const SDLoc & SL) const2106 SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2107 const SDLoc &SL) const {
2108
2109 Function &F = DAG.getMachineFunction().getFunction();
2110 std::optional<uint32_t> KnownSize =
2111 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
2112 if (KnownSize.has_value())
2113 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2114 return SDValue();
2115 }
2116
convertArgType(SelectionDAG & DAG,EVT VT,EVT MemVT,const SDLoc & SL,SDValue Val,bool Signed,const ISD::InputArg * Arg) const2117 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2118 const SDLoc &SL, SDValue Val,
2119 bool Signed,
2120 const ISD::InputArg *Arg) const {
2121 // First, if it is a widened vector, narrow it.
2122 if (VT.isVector() &&
2123 VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
2124 EVT NarrowedVT =
2125 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
2126 VT.getVectorNumElements());
2127 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2128 DAG.getConstant(0, SL, MVT::i32));
2129 }
2130
2131 // Then convert the vector elements or scalar value.
2132 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2133 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2134 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2135 }
2136
2137 if (MemVT.isFloatingPoint())
2138 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2139 else if (Signed)
2140 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2141 else
2142 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2143
2144 return Val;
2145 }
2146
lowerKernargMemParameter(SelectionDAG & DAG,EVT VT,EVT MemVT,const SDLoc & SL,SDValue Chain,uint64_t Offset,Align Alignment,bool Signed,const ISD::InputArg * Arg) const2147 SDValue SITargetLowering::lowerKernargMemParameter(
2148 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2149 uint64_t Offset, Align Alignment, bool Signed,
2150 const ISD::InputArg *Arg) const {
2151 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2152
2153 // Try to avoid using an extload by loading earlier than the argument address,
2154 // and extracting the relevant bits. The load should hopefully be merged with
2155 // the previous argument.
2156 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2157 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2158 int64_t AlignDownOffset = alignDown(Offset, 4);
2159 int64_t OffsetDiff = Offset - AlignDownOffset;
2160
2161 EVT IntVT = MemVT.changeTypeToInteger();
2162
2163 // TODO: If we passed in the base kernel offset we could have a better
2164 // alignment than 4, but we don't really need it.
2165 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2166 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2167 MachineMemOperand::MODereferenceable |
2168 MachineMemOperand::MOInvariant);
2169
2170 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2171 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2172
2173 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2174 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2175 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2176
2177 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2178 }
2179
2180 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2181 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2182 MachineMemOperand::MODereferenceable |
2183 MachineMemOperand::MOInvariant);
2184
2185 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2186 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2187 }
2188
lowerStackParameter(SelectionDAG & DAG,CCValAssign & VA,const SDLoc & SL,SDValue Chain,const ISD::InputArg & Arg) const2189 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2190 CCValAssign &VA, const SDLoc &SL,
2191 SDValue Chain,
2192 const ISD::InputArg &Arg) const {
2193 MachineFunction &MF = DAG.getMachineFunction();
2194 MachineFrameInfo &MFI = MF.getFrameInfo();
2195
2196 if (Arg.Flags.isByVal()) {
2197 unsigned Size = Arg.Flags.getByValSize();
2198 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2199 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2200 }
2201
2202 unsigned ArgOffset = VA.getLocMemOffset();
2203 unsigned ArgSize = VA.getValVT().getStoreSize();
2204
2205 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2206
2207 // Create load nodes to retrieve arguments from the stack.
2208 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2209 SDValue ArgValue;
2210
2211 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
2212 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2213 MVT MemVT = VA.getValVT();
2214
2215 switch (VA.getLocInfo()) {
2216 default:
2217 break;
2218 case CCValAssign::BCvt:
2219 MemVT = VA.getLocVT();
2220 break;
2221 case CCValAssign::SExt:
2222 ExtType = ISD::SEXTLOAD;
2223 break;
2224 case CCValAssign::ZExt:
2225 ExtType = ISD::ZEXTLOAD;
2226 break;
2227 case CCValAssign::AExt:
2228 ExtType = ISD::EXTLOAD;
2229 break;
2230 }
2231
2232 ArgValue = DAG.getExtLoad(
2233 ExtType, SL, VA.getLocVT(), Chain, FIN,
2234 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT);
2235 return ArgValue;
2236 }
2237
getPreloadedValue(SelectionDAG & DAG,const SIMachineFunctionInfo & MFI,EVT VT,AMDGPUFunctionArgInfo::PreloadedValue PVID) const2238 SDValue SITargetLowering::getPreloadedValue(
2239 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2240 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
2241 const ArgDescriptor *Reg = nullptr;
2242 const TargetRegisterClass *RC;
2243 LLT Ty;
2244
2245 CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
2246 const ArgDescriptor WorkGroupIDX =
2247 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2248 // If GridZ is not programmed in an entry function then the hardware will set
2249 // it to all zeros, so there is no need to mask the GridY value in the low
2250 // order bits.
2251 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2252 AMDGPU::TTMP7,
2253 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2254 const ArgDescriptor WorkGroupIDZ =
2255 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2256 if (Subtarget->hasArchitectedSGPRs() &&
2257 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
2258 switch (PVID) {
2259 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
2260 Reg = &WorkGroupIDX;
2261 RC = &AMDGPU::SReg_32RegClass;
2262 Ty = LLT::scalar(32);
2263 break;
2264 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
2265 Reg = &WorkGroupIDY;
2266 RC = &AMDGPU::SReg_32RegClass;
2267 Ty = LLT::scalar(32);
2268 break;
2269 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
2270 Reg = &WorkGroupIDZ;
2271 RC = &AMDGPU::SReg_32RegClass;
2272 Ty = LLT::scalar(32);
2273 break;
2274 default:
2275 break;
2276 }
2277 }
2278
2279 if (!Reg)
2280 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2281 if (!Reg) {
2282 if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
2283 // It's possible for a kernarg intrinsic call to appear in a kernel with
2284 // no allocated segment, in which case we do not add the user sgpr
2285 // argument, so just return null.
2286 return DAG.getConstant(0, SDLoc(), VT);
2287 }
2288
2289 // It's undefined behavior if a function marked with the amdgpu-no-*
2290 // attributes uses the corresponding intrinsic.
2291 return DAG.getPOISON(VT);
2292 }
2293
2294 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2295 }
2296
processPSInputArgs(SmallVectorImpl<ISD::InputArg> & Splits,CallingConv::ID CallConv,ArrayRef<ISD::InputArg> Ins,BitVector & Skipped,FunctionType * FType,SIMachineFunctionInfo * Info)2297 static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
2298 CallingConv::ID CallConv,
2299 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2300 FunctionType *FType,
2301 SIMachineFunctionInfo *Info) {
2302 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2303 const ISD::InputArg *Arg = &Ins[I];
2304
2305 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2306 "vector type argument should have been split");
2307
2308 // First check if it's a PS input addr.
2309 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2310 PSInputNum <= 15) {
2311 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2312
2313 // Inconveniently only the first part of the split is marked as isSplit,
2314 // so skip to the end. We only want to increment PSInputNum once for the
2315 // entire split argument.
2316 if (Arg->Flags.isSplit()) {
2317 while (!Arg->Flags.isSplitEnd()) {
2318 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2319 "unexpected vector split in ps argument type");
2320 if (!SkipArg)
2321 Splits.push_back(*Arg);
2322 Arg = &Ins[++I];
2323 }
2324 }
2325
2326 if (SkipArg) {
2327 // We can safely skip PS inputs.
2328 Skipped.set(Arg->getOrigArgIndex());
2329 ++PSInputNum;
2330 continue;
2331 }
2332
2333 Info->markPSInputAllocated(PSInputNum);
2334 if (Arg->Used)
2335 Info->markPSInputEnabled(PSInputNum);
2336
2337 ++PSInputNum;
2338 }
2339
2340 Splits.push_back(*Arg);
2341 }
2342 }
2343
2344 // Allocate special inputs passed in VGPRs.
allocateSpecialEntryInputVGPRs(CCState & CCInfo,MachineFunction & MF,const SIRegisterInfo & TRI,SIMachineFunctionInfo & Info) const2345 void SITargetLowering::allocateSpecialEntryInputVGPRs(
2346 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2347 SIMachineFunctionInfo &Info) const {
2348 const LLT S32 = LLT::scalar(32);
2349 MachineRegisterInfo &MRI = MF.getRegInfo();
2350
2351 if (Info.hasWorkItemIDX()) {
2352 Register Reg = AMDGPU::VGPR0;
2353 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2354
2355 CCInfo.AllocateReg(Reg);
2356 unsigned Mask =
2357 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2358 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2359 }
2360
2361 if (Info.hasWorkItemIDY()) {
2362 assert(Info.hasWorkItemIDX());
2363 if (Subtarget->hasPackedTID()) {
2364 Info.setWorkItemIDY(
2365 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2366 } else {
2367 unsigned Reg = AMDGPU::VGPR1;
2368 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2369
2370 CCInfo.AllocateReg(Reg);
2371 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2372 }
2373 }
2374
2375 if (Info.hasWorkItemIDZ()) {
2376 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2377 if (Subtarget->hasPackedTID()) {
2378 Info.setWorkItemIDZ(
2379 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2380 } else {
2381 unsigned Reg = AMDGPU::VGPR2;
2382 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2383
2384 CCInfo.AllocateReg(Reg);
2385 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2386 }
2387 }
2388 }
2389
2390 // Try to allocate a VGPR at the end of the argument list, or if no argument
2391 // VGPRs are left allocating a stack slot.
2392 // If \p Mask is is given it indicates bitfield position in the register.
2393 // If \p Arg is given use it with new ]p Mask instead of allocating new.
allocateVGPR32Input(CCState & CCInfo,unsigned Mask=~0u,ArgDescriptor Arg=ArgDescriptor ())2394 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2395 ArgDescriptor Arg = ArgDescriptor()) {
2396 if (Arg.isSet())
2397 return ArgDescriptor::createArg(Arg, Mask);
2398
2399 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2400 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2401 if (RegIdx == ArgVGPRs.size()) {
2402 // Spill to stack required.
2403 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2404
2405 return ArgDescriptor::createStack(Offset, Mask);
2406 }
2407
2408 unsigned Reg = ArgVGPRs[RegIdx];
2409 Reg = CCInfo.AllocateReg(Reg);
2410 assert(Reg != AMDGPU::NoRegister);
2411
2412 MachineFunction &MF = CCInfo.getMachineFunction();
2413 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2414 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2415 return ArgDescriptor::createRegister(Reg, Mask);
2416 }
2417
allocateSGPR32InputImpl(CCState & CCInfo,const TargetRegisterClass * RC,unsigned NumArgRegs)2418 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2419 const TargetRegisterClass *RC,
2420 unsigned NumArgRegs) {
2421 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2422 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2423 if (RegIdx == ArgSGPRs.size())
2424 report_fatal_error("ran out of SGPRs for arguments");
2425
2426 unsigned Reg = ArgSGPRs[RegIdx];
2427 Reg = CCInfo.AllocateReg(Reg);
2428 assert(Reg != AMDGPU::NoRegister);
2429
2430 MachineFunction &MF = CCInfo.getMachineFunction();
2431 MF.addLiveIn(Reg, RC);
2432 return ArgDescriptor::createRegister(Reg);
2433 }
2434
2435 // If this has a fixed position, we still should allocate the register in the
2436 // CCInfo state. Technically we could get away with this for values passed
2437 // outside of the normal argument range.
allocateFixedSGPRInputImpl(CCState & CCInfo,const TargetRegisterClass * RC,MCRegister Reg)2438 static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2439 const TargetRegisterClass *RC,
2440 MCRegister Reg) {
2441 Reg = CCInfo.AllocateReg(Reg);
2442 assert(Reg != AMDGPU::NoRegister);
2443 MachineFunction &MF = CCInfo.getMachineFunction();
2444 MF.addLiveIn(Reg, RC);
2445 }
2446
allocateSGPR32Input(CCState & CCInfo,ArgDescriptor & Arg)2447 static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2448 if (Arg) {
2449 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2450 Arg.getRegister());
2451 } else
2452 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2453 }
2454
allocateSGPR64Input(CCState & CCInfo,ArgDescriptor & Arg)2455 static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2456 if (Arg) {
2457 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2458 Arg.getRegister());
2459 } else
2460 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2461 }
2462
2463 /// Allocate implicit function VGPR arguments at the end of allocated user
2464 /// arguments.
allocateSpecialInputVGPRs(CCState & CCInfo,MachineFunction & MF,const SIRegisterInfo & TRI,SIMachineFunctionInfo & Info) const2465 void SITargetLowering::allocateSpecialInputVGPRs(
2466 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2467 SIMachineFunctionInfo &Info) const {
2468 const unsigned Mask = 0x3ff;
2469 ArgDescriptor Arg;
2470
2471 if (Info.hasWorkItemIDX()) {
2472 Arg = allocateVGPR32Input(CCInfo, Mask);
2473 Info.setWorkItemIDX(Arg);
2474 }
2475
2476 if (Info.hasWorkItemIDY()) {
2477 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2478 Info.setWorkItemIDY(Arg);
2479 }
2480
2481 if (Info.hasWorkItemIDZ())
2482 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2483 }
2484
2485 /// Allocate implicit function VGPR arguments in fixed registers.
allocateSpecialInputVGPRsFixed(CCState & CCInfo,MachineFunction & MF,const SIRegisterInfo & TRI,SIMachineFunctionInfo & Info) const2486 void SITargetLowering::allocateSpecialInputVGPRsFixed(
2487 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2488 SIMachineFunctionInfo &Info) const {
2489 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2490 if (!Reg)
2491 report_fatal_error("failed to allocate VGPR for implicit arguments");
2492
2493 const unsigned Mask = 0x3ff;
2494 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2495 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2496 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2497 }
2498
allocateSpecialInputSGPRs(CCState & CCInfo,MachineFunction & MF,const SIRegisterInfo & TRI,SIMachineFunctionInfo & Info) const2499 void SITargetLowering::allocateSpecialInputSGPRs(
2500 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2501 SIMachineFunctionInfo &Info) const {
2502 auto &ArgInfo = Info.getArgInfo();
2503 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2504
2505 // TODO: Unify handling with private memory pointers.
2506 if (UserSGPRInfo.hasDispatchPtr())
2507 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2508
2509 if (UserSGPRInfo.hasQueuePtr())
2510 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2511
2512 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2513 // constant offset from the kernarg segment.
2514 if (Info.hasImplicitArgPtr())
2515 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2516
2517 if (UserSGPRInfo.hasDispatchID())
2518 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2519
2520 // flat_scratch_init is not applicable for non-kernel functions.
2521
2522 if (Info.hasWorkGroupIDX())
2523 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2524
2525 if (Info.hasWorkGroupIDY())
2526 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2527
2528 if (Info.hasWorkGroupIDZ())
2529 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2530
2531 if (Info.hasLDSKernelId())
2532 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2533 }
2534
2535 // Allocate special inputs passed in user SGPRs.
allocateHSAUserSGPRs(CCState & CCInfo,MachineFunction & MF,const SIRegisterInfo & TRI,SIMachineFunctionInfo & Info) const2536 void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2537 MachineFunction &MF,
2538 const SIRegisterInfo &TRI,
2539 SIMachineFunctionInfo &Info) const {
2540 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2541 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2542 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2543 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2544 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2545 }
2546
2547 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2548 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2549 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2550 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2551 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2552 }
2553
2554 if (UserSGPRInfo.hasDispatchPtr()) {
2555 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2556 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2557 CCInfo.AllocateReg(DispatchPtrReg);
2558 }
2559
2560 if (UserSGPRInfo.hasQueuePtr()) {
2561 Register QueuePtrReg = Info.addQueuePtr(TRI);
2562 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2563 CCInfo.AllocateReg(QueuePtrReg);
2564 }
2565
2566 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2567 MachineRegisterInfo &MRI = MF.getRegInfo();
2568 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2569 CCInfo.AllocateReg(InputPtrReg);
2570
2571 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2572 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2573 }
2574
2575 if (UserSGPRInfo.hasDispatchID()) {
2576 Register DispatchIDReg = Info.addDispatchID(TRI);
2577 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2578 CCInfo.AllocateReg(DispatchIDReg);
2579 }
2580
2581 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2582 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2583 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2584 CCInfo.AllocateReg(FlatScratchInitReg);
2585 }
2586
2587 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2588 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2589 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2590 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2591 }
2592
2593 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2594 // these from the dispatch pointer.
2595 }
2596
2597 // Allocate pre-loaded kernel arguemtns. Arguments to be preloading must be
2598 // sequential starting from the first argument.
allocatePreloadKernArgSGPRs(CCState & CCInfo,SmallVectorImpl<CCValAssign> & ArgLocs,const SmallVectorImpl<ISD::InputArg> & Ins,MachineFunction & MF,const SIRegisterInfo & TRI,SIMachineFunctionInfo & Info) const2599 void SITargetLowering::allocatePreloadKernArgSGPRs(
2600 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2601 const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
2602 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2603 Function &F = MF.getFunction();
2604 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2605 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2606 bool InPreloadSequence = true;
2607 unsigned InIdx = 0;
2608 bool AlignedForImplictArgs = false;
2609 unsigned ImplicitArgOffset = 0;
2610 for (auto &Arg : F.args()) {
2611 if (!InPreloadSequence || !Arg.hasInRegAttr())
2612 break;
2613
2614 unsigned ArgIdx = Arg.getArgNo();
2615 // Don't preload non-original args or parts not in the current preload
2616 // sequence.
2617 if (InIdx < Ins.size() &&
2618 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2619 break;
2620
2621 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2622 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2623 InIdx++) {
2624 assert(ArgLocs[ArgIdx].isMemLoc());
2625 auto &ArgLoc = ArgLocs[InIdx];
2626 const Align KernelArgBaseAlign = Align(16);
2627 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2628 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2629 unsigned NumAllocSGPRs =
2630 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2631
2632 // Fix alignment for hidden arguments.
2633 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2634 if (!AlignedForImplictArgs) {
2635 ImplicitArgOffset =
2636 alignTo(LastExplicitArgOffset,
2637 Subtarget->getAlignmentForImplicitArgPtr()) -
2638 LastExplicitArgOffset;
2639 AlignedForImplictArgs = true;
2640 }
2641 ArgOffset += ImplicitArgOffset;
2642 }
2643
2644 // Arg is preloaded into the previous SGPR.
2645 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2646 assert(InIdx >= 1 && "No previous SGPR");
2647 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2648 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2649 continue;
2650 }
2651
2652 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2653 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2654 // Check for free user SGPRs for preloading.
2655 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2656 InPreloadSequence = false;
2657 break;
2658 }
2659
2660 // Preload this argument.
2661 const TargetRegisterClass *RC =
2662 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2663 SmallVectorImpl<MCRegister> *PreloadRegs =
2664 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2665
2666 if (PreloadRegs->size() > 1)
2667 RC = &AMDGPU::SGPR_32RegClass;
2668 for (auto &Reg : *PreloadRegs) {
2669 assert(Reg);
2670 MF.addLiveIn(Reg, RC);
2671 CCInfo.AllocateReg(Reg);
2672 }
2673
2674 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2675 }
2676 }
2677 }
2678
allocateLDSKernelId(CCState & CCInfo,MachineFunction & MF,const SIRegisterInfo & TRI,SIMachineFunctionInfo & Info) const2679 void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
2680 const SIRegisterInfo &TRI,
2681 SIMachineFunctionInfo &Info) const {
2682 // Always allocate this last since it is a synthetic preload.
2683 if (Info.hasLDSKernelId()) {
2684 Register Reg = Info.addLDSKernelId();
2685 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2686 CCInfo.AllocateReg(Reg);
2687 }
2688 }
2689
2690 // Allocate special input registers that are initialized per-wave.
allocateSystemSGPRs(CCState & CCInfo,MachineFunction & MF,SIMachineFunctionInfo & Info,CallingConv::ID CallConv,bool IsShader) const2691 void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
2692 SIMachineFunctionInfo &Info,
2693 CallingConv::ID CallConv,
2694 bool IsShader) const {
2695 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2696 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2697 // Note: user SGPRs are handled by the front-end for graphics shaders
2698 // Pad up the used user SGPRs with dead inputs.
2699
2700 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2701 // before enabling architected SGPRs for workgroup IDs.
2702 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2703
2704 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2705 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2706 // rely on it to reach 16 since if we end up having no stack usage, it will
2707 // not really be added.
2708 unsigned NumRequiredSystemSGPRs =
2709 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2710 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2711 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2712 Register Reg = Info.addReservedUserSGPR();
2713 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2714 CCInfo.AllocateReg(Reg);
2715 }
2716 }
2717
2718 if (!HasArchitectedSGPRs) {
2719 if (Info.hasWorkGroupIDX()) {
2720 Register Reg = Info.addWorkGroupIDX();
2721 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2722 CCInfo.AllocateReg(Reg);
2723 }
2724
2725 if (Info.hasWorkGroupIDY()) {
2726 Register Reg = Info.addWorkGroupIDY();
2727 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2728 CCInfo.AllocateReg(Reg);
2729 }
2730
2731 if (Info.hasWorkGroupIDZ()) {
2732 Register Reg = Info.addWorkGroupIDZ();
2733 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2734 CCInfo.AllocateReg(Reg);
2735 }
2736 }
2737
2738 if (Info.hasWorkGroupInfo()) {
2739 Register Reg = Info.addWorkGroupInfo();
2740 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2741 CCInfo.AllocateReg(Reg);
2742 }
2743
2744 if (Info.hasPrivateSegmentWaveByteOffset()) {
2745 // Scratch wave offset passed in system SGPR.
2746 unsigned PrivateSegmentWaveByteOffsetReg;
2747
2748 if (IsShader) {
2749 PrivateSegmentWaveByteOffsetReg =
2750 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2751
2752 // This is true if the scratch wave byte offset doesn't have a fixed
2753 // location.
2754 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2755 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2756 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2757 }
2758 } else
2759 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2760
2761 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2762 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2763 }
2764
2765 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2766 Info.getNumPreloadedSGPRs() >= 16);
2767 }
2768
reservePrivateMemoryRegs(const TargetMachine & TM,MachineFunction & MF,const SIRegisterInfo & TRI,SIMachineFunctionInfo & Info)2769 static void reservePrivateMemoryRegs(const TargetMachine &TM,
2770 MachineFunction &MF,
2771 const SIRegisterInfo &TRI,
2772 SIMachineFunctionInfo &Info) {
2773 // Now that we've figured out where the scratch register inputs are, see if
2774 // should reserve the arguments and use them directly.
2775 MachineFrameInfo &MFI = MF.getFrameInfo();
2776 bool HasStackObjects = MFI.hasStackObjects();
2777 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2778
2779 // Record that we know we have non-spill stack objects so we don't need to
2780 // check all stack objects later.
2781 if (HasStackObjects)
2782 Info.setHasNonSpillStackObjects(true);
2783
2784 // Everything live out of a block is spilled with fast regalloc, so it's
2785 // almost certain that spilling will be required.
2786 if (TM.getOptLevel() == CodeGenOptLevel::None)
2787 HasStackObjects = true;
2788
2789 // For now assume stack access is needed in any callee functions, so we need
2790 // the scratch registers to pass in.
2791 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2792
2793 if (!ST.enableFlatScratch()) {
2794 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2795 // If we have stack objects, we unquestionably need the private buffer
2796 // resource. For the Code Object V2 ABI, this will be the first 4 user
2797 // SGPR inputs. We can reserve those and use them directly.
2798
2799 Register PrivateSegmentBufferReg =
2800 Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
2801 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2802 } else {
2803 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2804 // We tentatively reserve the last registers (skipping the last registers
2805 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2806 // we'll replace these with the ones immediately after those which were
2807 // really allocated. In the prologue copies will be inserted from the
2808 // argument to these reserved registers.
2809
2810 // Without HSA, relocations are used for the scratch pointer and the
2811 // buffer resource setup is always inserted in the prologue. Scratch wave
2812 // offset is still in an input SGPR.
2813 Info.setScratchRSrcReg(ReservedBufferReg);
2814 }
2815 }
2816
2817 MachineRegisterInfo &MRI = MF.getRegInfo();
2818
2819 // For entry functions we have to set up the stack pointer if we use it,
2820 // whereas non-entry functions get this "for free". This means there is no
2821 // intrinsic advantage to using S32 over S34 in cases where we do not have
2822 // calls but do need a frame pointer (i.e. if we are requested to have one
2823 // because frame pointer elimination is disabled). To keep things simple we
2824 // only ever use S32 as the call ABI stack pointer, and so using it does not
2825 // imply we need a separate frame pointer.
2826 //
2827 // Try to use s32 as the SP, but move it if it would interfere with input
2828 // arguments. This won't work with calls though.
2829 //
2830 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2831 // registers.
2832 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2833 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2834 } else {
2835 assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
2836
2837 if (MFI.hasCalls())
2838 report_fatal_error("call in graphics shader with too many input SGPRs");
2839
2840 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2841 if (!MRI.isLiveIn(Reg)) {
2842 Info.setStackPtrOffsetReg(Reg);
2843 break;
2844 }
2845 }
2846
2847 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2848 report_fatal_error("failed to find register for SP");
2849 }
2850
2851 // hasFP should be accurate for entry functions even before the frame is
2852 // finalized, because it does not rely on the known stack size, only
2853 // properties like whether variable sized objects are present.
2854 if (ST.getFrameLowering()->hasFP(MF)) {
2855 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2856 }
2857 }
2858
supportSplitCSR(MachineFunction * MF) const2859 bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
2860 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2861 return !Info->isEntryFunction();
2862 }
2863
initializeSplitCSR(MachineBasicBlock * Entry) const2864 void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {}
2865
insertCopiesSplitCSR(MachineBasicBlock * Entry,const SmallVectorImpl<MachineBasicBlock * > & Exits) const2866 void SITargetLowering::insertCopiesSplitCSR(
2867 MachineBasicBlock *Entry,
2868 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2869 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2870
2871 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2872 if (!IStart)
2873 return;
2874
2875 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2876 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2877 MachineBasicBlock::iterator MBBI = Entry->begin();
2878 for (const MCPhysReg *I = IStart; *I; ++I) {
2879 const TargetRegisterClass *RC = nullptr;
2880 if (AMDGPU::SReg_64RegClass.contains(*I))
2881 RC = &AMDGPU::SGPR_64RegClass;
2882 else if (AMDGPU::SReg_32RegClass.contains(*I))
2883 RC = &AMDGPU::SGPR_32RegClass;
2884 else
2885 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2886
2887 Register NewVR = MRI->createVirtualRegister(RC);
2888 // Create copy from CSR to a virtual register.
2889 Entry->addLiveIn(*I);
2890 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2891 .addReg(*I);
2892
2893 // Insert the copy-back instructions right before the terminator.
2894 for (auto *Exit : Exits)
2895 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2896 TII->get(TargetOpcode::COPY), *I)
2897 .addReg(NewVR);
2898 }
2899 }
2900
LowerFormalArguments(SDValue Chain,CallingConv::ID CallConv,bool isVarArg,const SmallVectorImpl<ISD::InputArg> & Ins,const SDLoc & DL,SelectionDAG & DAG,SmallVectorImpl<SDValue> & InVals) const2901 SDValue SITargetLowering::LowerFormalArguments(
2902 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2903 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2904 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2905 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2906
2907 MachineFunction &MF = DAG.getMachineFunction();
2908 const Function &Fn = MF.getFunction();
2909 FunctionType *FType = MF.getFunction().getFunctionType();
2910 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2911 bool IsError = false;
2912
2913 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2914 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2915 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
2916 IsError = true;
2917 }
2918
2919 SmallVector<ISD::InputArg, 16> Splits;
2920 SmallVector<CCValAssign, 16> ArgLocs;
2921 BitVector Skipped(Ins.size());
2922 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2923 *DAG.getContext());
2924
2925 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2926 bool IsKernel = AMDGPU::isKernel(CallConv);
2927 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2928
2929 if (IsGraphics) {
2930 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2931 assert(!UserSGPRInfo.hasDispatchPtr() &&
2932 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2933 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2934 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2935 (void)UserSGPRInfo;
2936 if (!Subtarget->enableFlatScratch())
2937 assert(!UserSGPRInfo.hasFlatScratchInit());
2938 if ((CallConv != CallingConv::AMDGPU_CS &&
2939 CallConv != CallingConv::AMDGPU_Gfx) ||
2940 !Subtarget->hasArchitectedSGPRs())
2941 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2942 !Info->hasWorkGroupIDZ());
2943 }
2944
2945 if (CallConv == CallingConv::AMDGPU_PS) {
2946 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2947
2948 // At least one interpolation mode must be enabled or else the GPU will
2949 // hang.
2950 //
2951 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2952 // set PSInputAddr, the user wants to enable some bits after the compilation
2953 // based on run-time states. Since we can't know what the final PSInputEna
2954 // will look like, so we shouldn't do anything here and the user should take
2955 // responsibility for the correct programming.
2956 //
2957 // Otherwise, the following restrictions apply:
2958 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2959 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2960 // enabled too.
2961 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2962 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2963 CCInfo.AllocateReg(AMDGPU::VGPR0);
2964 CCInfo.AllocateReg(AMDGPU::VGPR1);
2965 Info->markPSInputAllocated(0);
2966 Info->markPSInputEnabled(0);
2967 }
2968 if (Subtarget->isAmdPalOS()) {
2969 // For isAmdPalOS, the user does not enable some bits after compilation
2970 // based on run-time states; the register values being generated here are
2971 // the final ones set in hardware. Therefore we need to apply the
2972 // workaround to PSInputAddr and PSInputEnable together. (The case where
2973 // a bit is set in PSInputAddr but not PSInputEnable is where the
2974 // frontend set up an input arg for a particular interpolation mode, but
2975 // nothing uses that input arg. Really we should have an earlier pass
2976 // that removes such an arg.)
2977 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2978 if ((PsInputBits & 0x7F) == 0 ||
2979 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2980 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2981 }
2982 } else if (IsKernel) {
2983 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2984 } else {
2985 Splits.append(Ins.begin(), Ins.end());
2986 }
2987
2988 if (IsKernel)
2989 analyzeFormalArgumentsCompute(CCInfo, Ins);
2990
2991 if (IsEntryFunc) {
2992 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2993 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2994 if (IsKernel && Subtarget->hasKernargPreload())
2995 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2996
2997 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2998 } else if (!IsGraphics) {
2999 // For the fixed ABI, pass workitem IDs in the last argument register.
3000 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3001
3002 // FIXME: Sink this into allocateSpecialInputSGPRs
3003 if (!Subtarget->enableFlatScratch())
3004 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3005
3006 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3007 }
3008
3009 if (!IsKernel) {
3010 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3011 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3012 }
3013
3014 SmallVector<SDValue, 16> Chains;
3015
3016 // FIXME: This is the minimum kernel argument alignment. We should improve
3017 // this to the maximum alignment of the arguments.
3018 //
3019 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3020 // kern arg offset.
3021 const Align KernelArgBaseAlign = Align(16);
3022
3023 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
3024 const ISD::InputArg &Arg = Ins[i];
3025 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3026 InVals.push_back(DAG.getPOISON(Arg.VT));
3027 continue;
3028 }
3029
3030 CCValAssign &VA = ArgLocs[ArgIdx++];
3031 MVT VT = VA.getLocVT();
3032
3033 if (IsEntryFunc && VA.isMemLoc()) {
3034 VT = Ins[i].VT;
3035 EVT MemVT = VA.getLocVT();
3036
3037 const uint64_t Offset = VA.getLocMemOffset();
3038 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3039
3040 if (Arg.Flags.isByRef()) {
3041 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3042
3043 const GCNTargetMachine &TM =
3044 static_cast<const GCNTargetMachine &>(getTargetMachine());
3045 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3046 Arg.Flags.getPointerAddrSpace())) {
3047 Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
3048 Arg.Flags.getPointerAddrSpace());
3049 }
3050
3051 InVals.push_back(Ptr);
3052 continue;
3053 }
3054
3055 SDValue NewArg;
3056 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3057 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3058 // In this case the argument is packed into the previous preload SGPR.
3059 int64_t AlignDownOffset = alignDown(Offset, 4);
3060 int64_t OffsetDiff = Offset - AlignDownOffset;
3061 EVT IntVT = MemVT.changeTypeToInteger();
3062
3063 const SIMachineFunctionInfo *Info =
3064 MF.getInfo<SIMachineFunctionInfo>();
3065 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
3066 Register Reg =
3067 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3068
3069 assert(Reg);
3070 Register VReg = MRI.getLiveInVirtReg(Reg);
3071 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3072
3073 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3074 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3075
3076 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3077 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3078 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3079 Ins[i].Flags.isSExt(), &Ins[i]);
3080
3081 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3082 } else {
3083 const SIMachineFunctionInfo *Info =
3084 MF.getInfo<SIMachineFunctionInfo>();
3085 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
3086 const SmallVectorImpl<MCRegister> &PreloadRegs =
3087 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3088
3089 SDValue Copy;
3090 if (PreloadRegs.size() == 1) {
3091 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3092 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3093 NewArg = DAG.getCopyFromReg(
3094 Chain, DL, VReg,
3095 EVT::getIntegerVT(*DAG.getContext(),
3096 TRI->getRegSizeInBits(*RC)));
3097
3098 } else {
3099 // If the kernarg alignment does not match the alignment of the SGPR
3100 // tuple RC that can accommodate this argument, it will be built up
3101 // via copies from from the individual SGPRs that the argument was
3102 // preloaded to.
3103 SmallVector<SDValue, 4> Elts;
3104 for (auto Reg : PreloadRegs) {
3105 Register VReg = MRI.getLiveInVirtReg(Reg);
3106 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3107 Elts.push_back(Copy);
3108 }
3109 NewArg =
3110 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3111 PreloadRegs.size()),
3112 DL, Elts);
3113 }
3114
3115 // If the argument was preloaded to multiple consecutive 32-bit
3116 // registers because of misalignment between addressable SGPR tuples
3117 // and the argument size, we can still assume that because of kernarg
3118 // segment alignment restrictions that NewArg's size is the same as
3119 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3120 // truncate since we cannot preload to less than a single SGPR and the
3121 // MemVT may be smaller.
3122 EVT MemVTInt =
3123 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
3124 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3125 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3126
3127 NewArg = DAG.getBitcast(MemVT, NewArg);
3128 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3129 Ins[i].Flags.isSExt(), &Ins[i]);
3130 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3131 }
3132 } else {
3133 // Hidden arguments that are in the kernel signature must be preloaded
3134 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3135 // the argument list and is not preloaded.
3136 if (Arg.isOrigArg()) {
3137 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3138 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3139 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
3140 *OrigArg->getParent(),
3141 "hidden argument in kernel signature was not preloaded",
3142 DL.getDebugLoc()));
3143 }
3144 }
3145
3146 NewArg =
3147 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3148 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3149 }
3150 Chains.push_back(NewArg.getValue(1));
3151
3152 auto *ParamTy =
3153 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3154 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3155 ParamTy &&
3156 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3157 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3158 // On SI local pointers are just offsets into LDS, so they are always
3159 // less than 16-bits. On CI and newer they could potentially be
3160 // real pointers, so we can't guarantee their size.
3161 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3162 DAG.getValueType(MVT::i16));
3163 }
3164
3165 InVals.push_back(NewArg);
3166 continue;
3167 }
3168 if (!IsEntryFunc && VA.isMemLoc()) {
3169 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3170 InVals.push_back(Val);
3171 if (!Arg.Flags.isByVal())
3172 Chains.push_back(Val.getValue(1));
3173 continue;
3174 }
3175
3176 assert(VA.isRegLoc() && "Parameter must be in a register!");
3177
3178 Register Reg = VA.getLocReg();
3179 const TargetRegisterClass *RC = nullptr;
3180 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3181 RC = &AMDGPU::VGPR_32RegClass;
3182 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3183 RC = &AMDGPU::SGPR_32RegClass;
3184 else
3185 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3186 EVT ValVT = VA.getValVT();
3187
3188 Reg = MF.addLiveIn(Reg, RC);
3189 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3190
3191 if (Arg.Flags.isSRet()) {
3192 // The return object should be reasonably addressable.
3193
3194 // FIXME: This helps when the return is a real sret. If it is a
3195 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3196 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3197 unsigned NumBits =
3198 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3199 Val = DAG.getNode(
3200 ISD::AssertZext, DL, VT, Val,
3201 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3202 }
3203
3204 // If this is an 8 or 16-bit value, it is really passed promoted
3205 // to 32 bits. Insert an assert[sz]ext to capture this, then
3206 // truncate to the right size.
3207 switch (VA.getLocInfo()) {
3208 case CCValAssign::Full:
3209 break;
3210 case CCValAssign::BCvt:
3211 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3212 break;
3213 case CCValAssign::SExt:
3214 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
3215 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3216 break;
3217 case CCValAssign::ZExt:
3218 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
3219 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3220 break;
3221 case CCValAssign::AExt:
3222 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3223 break;
3224 default:
3225 llvm_unreachable("Unknown loc info!");
3226 }
3227
3228 InVals.push_back(Val);
3229 }
3230
3231 // Start adding system SGPRs.
3232 if (IsEntryFunc)
3233 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3234
3235 // DAG.getPass() returns nullptr when using new pass manager.
3236 // TODO: Use DAG.getMFAM() to access analysis result.
3237 if (DAG.getPass()) {
3238 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3239 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3240 }
3241
3242 unsigned StackArgSize = CCInfo.getStackSize();
3243 Info->setBytesInStackArgArea(StackArgSize);
3244
3245 return Chains.empty() ? Chain
3246 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3247 }
3248
3249 // TODO: If return values can't fit in registers, we should return as many as
3250 // possible in registers before passing on stack.
CanLowerReturn(CallingConv::ID CallConv,MachineFunction & MF,bool IsVarArg,const SmallVectorImpl<ISD::OutputArg> & Outs,LLVMContext & Context,const Type * RetTy) const3251 bool SITargetLowering::CanLowerReturn(
3252 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3253 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3254 const Type *RetTy) const {
3255 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3256 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3257 // for shaders. Vector types should be explicitly handled by CC.
3258 if (AMDGPU::isEntryFunctionCC(CallConv))
3259 return true;
3260
3261 SmallVector<CCValAssign, 16> RVLocs;
3262 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3263 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3264 return false;
3265
3266 // We must use the stack if return would require unavailable registers.
3267 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3268 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3269 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3270 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3271 return false;
3272
3273 return true;
3274 }
3275
3276 SDValue
LowerReturn(SDValue Chain,CallingConv::ID CallConv,bool isVarArg,const SmallVectorImpl<ISD::OutputArg> & Outs,const SmallVectorImpl<SDValue> & OutVals,const SDLoc & DL,SelectionDAG & DAG) const3277 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3278 bool isVarArg,
3279 const SmallVectorImpl<ISD::OutputArg> &Outs,
3280 const SmallVectorImpl<SDValue> &OutVals,
3281 const SDLoc &DL, SelectionDAG &DAG) const {
3282 MachineFunction &MF = DAG.getMachineFunction();
3283 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3284 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3285
3286 if (AMDGPU::isKernel(CallConv)) {
3287 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3288 OutVals, DL, DAG);
3289 }
3290
3291 bool IsShader = AMDGPU::isShader(CallConv);
3292
3293 Info->setIfReturnsVoid(Outs.empty());
3294 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3295
3296 // CCValAssign - represent the assignment of the return value to a location.
3297 SmallVector<CCValAssign, 48> RVLocs;
3298
3299 // CCState - Info about the registers and stack slots.
3300 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3301 *DAG.getContext());
3302
3303 // Analyze outgoing return values.
3304 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3305
3306 SDValue Glue;
3307 SmallVector<SDValue, 48> RetOps;
3308 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3309
3310 SDValue ReadFirstLane =
3311 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3312 // Copy the result values into the output registers.
3313 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3314 ++I, ++RealRVLocIdx) {
3315 CCValAssign &VA = RVLocs[I];
3316 assert(VA.isRegLoc() && "Can only return in registers!");
3317 // TODO: Partially return in registers if return values don't fit.
3318 SDValue Arg = OutVals[RealRVLocIdx];
3319
3320 // Copied from other backends.
3321 switch (VA.getLocInfo()) {
3322 case CCValAssign::Full:
3323 break;
3324 case CCValAssign::BCvt:
3325 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3326 break;
3327 case CCValAssign::SExt:
3328 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3329 break;
3330 case CCValAssign::ZExt:
3331 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3332 break;
3333 case CCValAssign::AExt:
3334 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3335 break;
3336 default:
3337 llvm_unreachable("Unknown loc info!");
3338 }
3339 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3340 Arg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Arg.getValueType(),
3341 ReadFirstLane, Arg);
3342 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3343 Glue = Chain.getValue(1);
3344 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3345 }
3346
3347 // FIXME: Does sret work properly?
3348 if (!Info->isEntryFunction()) {
3349 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3350 const MCPhysReg *I =
3351 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3352 if (I) {
3353 for (; *I; ++I) {
3354 if (AMDGPU::SReg_64RegClass.contains(*I))
3355 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3356 else if (AMDGPU::SReg_32RegClass.contains(*I))
3357 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3358 else
3359 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3360 }
3361 }
3362 }
3363
3364 // Update chain and glue.
3365 RetOps[0] = Chain;
3366 if (Glue.getNode())
3367 RetOps.push_back(Glue);
3368
3369 unsigned Opc = AMDGPUISD::ENDPGM;
3370 if (!IsWaveEnd)
3371 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE;
3372 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3373 }
3374
LowerCallResult(SDValue Chain,SDValue InGlue,CallingConv::ID CallConv,bool IsVarArg,const SmallVectorImpl<ISD::InputArg> & Ins,const SDLoc & DL,SelectionDAG & DAG,SmallVectorImpl<SDValue> & InVals,bool IsThisReturn,SDValue ThisVal) const3375 SDValue SITargetLowering::LowerCallResult(
3376 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3377 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3378 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3379 SDValue ThisVal) const {
3380 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3381
3382 // Assign locations to each value returned by this call.
3383 SmallVector<CCValAssign, 16> RVLocs;
3384 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3385 *DAG.getContext());
3386 CCInfo.AnalyzeCallResult(Ins, RetCC);
3387
3388 // Copy all of the result registers out of their specified physreg.
3389 for (CCValAssign VA : RVLocs) {
3390 SDValue Val;
3391
3392 if (VA.isRegLoc()) {
3393 Val =
3394 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3395 Chain = Val.getValue(1);
3396 InGlue = Val.getValue(2);
3397 } else if (VA.isMemLoc()) {
3398 report_fatal_error("TODO: return values in memory");
3399 } else
3400 llvm_unreachable("unknown argument location type");
3401
3402 switch (VA.getLocInfo()) {
3403 case CCValAssign::Full:
3404 break;
3405 case CCValAssign::BCvt:
3406 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3407 break;
3408 case CCValAssign::ZExt:
3409 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3410 DAG.getValueType(VA.getValVT()));
3411 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3412 break;
3413 case CCValAssign::SExt:
3414 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3415 DAG.getValueType(VA.getValVT()));
3416 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3417 break;
3418 case CCValAssign::AExt:
3419 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3420 break;
3421 default:
3422 llvm_unreachable("Unknown loc info!");
3423 }
3424
3425 InVals.push_back(Val);
3426 }
3427
3428 return Chain;
3429 }
3430
3431 // Add code to pass special inputs required depending on used features separate
3432 // from the explicit user arguments present in the IR.
passSpecialInputs(CallLoweringInfo & CLI,CCState & CCInfo,const SIMachineFunctionInfo & Info,SmallVectorImpl<std::pair<unsigned,SDValue>> & RegsToPass,SmallVectorImpl<SDValue> & MemOpChains,SDValue Chain) const3433 void SITargetLowering::passSpecialInputs(
3434 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3435 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3436 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3437 // If we don't have a call site, this was a call inserted by
3438 // legalization. These can never use special inputs.
3439 if (!CLI.CB)
3440 return;
3441
3442 SelectionDAG &DAG = CLI.DAG;
3443 const SDLoc &DL = CLI.DL;
3444 const Function &F = DAG.getMachineFunction().getFunction();
3445
3446 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3447 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3448
3449 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3450 &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
3451 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3452 // DAG.getPass() returns nullptr when using new pass manager.
3453 // TODO: Use DAG.getMFAM() to access analysis result.
3454 if (DAG.getPass()) {
3455 auto &ArgUsageInfo =
3456 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3457 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3458 }
3459 }
3460
3461 // TODO: Unify with private memory register handling. This is complicated by
3462 // the fact that at least in kernels, the input argument is not necessarily
3463 // in the same location as the input.
3464 // clang-format off
3465 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3466 StringLiteral> ImplicitAttrs[] = {
3467 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3468 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3469 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3470 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3471 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3472 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3473 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3474 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3475 };
3476 // clang-format on
3477
3478 for (auto [InputID, Attr] : ImplicitAttrs) {
3479 // If the callee does not use the attribute value, skip copying the value.
3480 if (CLI.CB->hasFnAttr(Attr))
3481 continue;
3482
3483 const auto [OutgoingArg, ArgRC, ArgTy] =
3484 CalleeArgInfo->getPreloadedValue(InputID);
3485 if (!OutgoingArg)
3486 continue;
3487
3488 const auto [IncomingArg, IncomingArgRC, Ty] =
3489 CallerArgInfo.getPreloadedValue(InputID);
3490 assert(IncomingArgRC == ArgRC);
3491
3492 // All special arguments are ints for now.
3493 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3494 SDValue InputReg;
3495
3496 if (IncomingArg) {
3497 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3498 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3499 // The implicit arg ptr is special because it doesn't have a corresponding
3500 // input for kernels, and is computed from the kernarg segment pointer.
3501 InputReg = getImplicitArgPtr(DAG, DL);
3502 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3503 std::optional<uint32_t> Id =
3504 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3505 if (Id.has_value()) {
3506 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3507 } else {
3508 InputReg = DAG.getPOISON(ArgVT);
3509 }
3510 } else {
3511 // We may have proven the input wasn't needed, although the ABI is
3512 // requiring it. We just need to allocate the register appropriately.
3513 InputReg = DAG.getPOISON(ArgVT);
3514 }
3515
3516 if (OutgoingArg->isRegister()) {
3517 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3518 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3519 report_fatal_error("failed to allocate implicit input argument");
3520 } else {
3521 unsigned SpecialArgOffset =
3522 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3523 SDValue ArgStore =
3524 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3525 MemOpChains.push_back(ArgStore);
3526 }
3527 }
3528
3529 // Pack workitem IDs into a single register or pass it as is if already
3530 // packed.
3531
3532 auto [OutgoingArg, ArgRC, Ty] =
3533 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3534 if (!OutgoingArg)
3535 std::tie(OutgoingArg, ArgRC, Ty) =
3536 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3537 if (!OutgoingArg)
3538 std::tie(OutgoingArg, ArgRC, Ty) =
3539 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3540 if (!OutgoingArg)
3541 return;
3542
3543 const ArgDescriptor *IncomingArgX = std::get<0>(
3544 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3545 const ArgDescriptor *IncomingArgY = std::get<0>(
3546 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3547 const ArgDescriptor *IncomingArgZ = std::get<0>(
3548 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3549
3550 SDValue InputReg;
3551 SDLoc SL;
3552
3553 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3554 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3555 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3556
3557 // If incoming ids are not packed we need to pack them.
3558 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3559 NeedWorkItemIDX) {
3560 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3561 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3562 } else {
3563 InputReg = DAG.getConstant(0, DL, MVT::i32);
3564 }
3565 }
3566
3567 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3568 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3569 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3570 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3571 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3572 InputReg = InputReg.getNode()
3573 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3574 : Y;
3575 }
3576
3577 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3578 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3579 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3580 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3581 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3582 InputReg = InputReg.getNode()
3583 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3584 : Z;
3585 }
3586
3587 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3588 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3589 // We're in a situation where the outgoing function requires the workitem
3590 // ID, but the calling function does not have it (e.g a graphics function
3591 // calling a C calling convention function). This is illegal, but we need
3592 // to produce something.
3593 InputReg = DAG.getPOISON(MVT::i32);
3594 } else {
3595 // Workitem ids are already packed, any of present incoming arguments
3596 // will carry all required fields.
3597 ArgDescriptor IncomingArg =
3598 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3599 : IncomingArgY ? *IncomingArgY
3600 : *IncomingArgZ,
3601 ~0u);
3602 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3603 }
3604 }
3605
3606 if (OutgoingArg->isRegister()) {
3607 if (InputReg)
3608 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3609
3610 CCInfo.AllocateReg(OutgoingArg->getRegister());
3611 } else {
3612 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3613 if (InputReg) {
3614 SDValue ArgStore =
3615 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3616 MemOpChains.push_back(ArgStore);
3617 }
3618 }
3619 }
3620
isEligibleForTailCallOptimization(SDValue Callee,CallingConv::ID CalleeCC,bool IsVarArg,const SmallVectorImpl<ISD::OutputArg> & Outs,const SmallVectorImpl<SDValue> & OutVals,const SmallVectorImpl<ISD::InputArg> & Ins,SelectionDAG & DAG) const3621 bool SITargetLowering::isEligibleForTailCallOptimization(
3622 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3623 const SmallVectorImpl<ISD::OutputArg> &Outs,
3624 const SmallVectorImpl<SDValue> &OutVals,
3625 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3626 if (AMDGPU::isChainCC(CalleeCC))
3627 return true;
3628
3629 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3630 return false;
3631
3632 // For a divergent call target, we need to do a waterfall loop over the
3633 // possible callees which precludes us from using a simple jump.
3634 if (Callee->isDivergent())
3635 return false;
3636
3637 MachineFunction &MF = DAG.getMachineFunction();
3638 const Function &CallerF = MF.getFunction();
3639 CallingConv::ID CallerCC = CallerF.getCallingConv();
3640 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3641 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3642
3643 // Kernels aren't callable, and don't have a live in return address so it
3644 // doesn't make sense to do a tail call with entry functions.
3645 if (!CallerPreserved)
3646 return false;
3647
3648 bool CCMatch = CallerCC == CalleeCC;
3649
3650 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3651 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3652 return true;
3653 return false;
3654 }
3655
3656 // TODO: Can we handle var args?
3657 if (IsVarArg)
3658 return false;
3659
3660 for (const Argument &Arg : CallerF.args()) {
3661 if (Arg.hasByValAttr())
3662 return false;
3663 }
3664
3665 LLVMContext &Ctx = *DAG.getContext();
3666
3667 // Check that the call results are passed in the same way.
3668 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3669 CCAssignFnForCall(CalleeCC, IsVarArg),
3670 CCAssignFnForCall(CallerCC, IsVarArg)))
3671 return false;
3672
3673 // The callee has to preserve all registers the caller needs to preserve.
3674 if (!CCMatch) {
3675 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3676 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3677 return false;
3678 }
3679
3680 // Nothing more to check if the callee is taking no arguments.
3681 if (Outs.empty())
3682 return true;
3683
3684 SmallVector<CCValAssign, 16> ArgLocs;
3685 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3686
3687 // FIXME: We are not allocating special input registers, so we will be
3688 // deciding based on incorrect register assignments.
3689 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3690
3691 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3692 // If the stack arguments for this call do not fit into our own save area then
3693 // the call cannot be made tail.
3694 // TODO: Is this really necessary?
3695 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3696 return false;
3697
3698 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3699 // FIXME: What about inreg arguments that end up passed in memory?
3700 if (!CCVA.isRegLoc())
3701 continue;
3702
3703 // If we are passing an argument in an SGPR, and the value is divergent,
3704 // this call requires a waterfall loop.
3705 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3706 LLVM_DEBUG(
3707 dbgs() << "Cannot tail call due to divergent outgoing argument in "
3708 << printReg(CCVA.getLocReg(), TRI) << '\n');
3709 return false;
3710 }
3711 }
3712
3713 const MachineRegisterInfo &MRI = MF.getRegInfo();
3714 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3715 }
3716
mayBeEmittedAsTailCall(const CallInst * CI) const3717 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3718 if (!CI->isTailCall())
3719 return false;
3720
3721 const Function *ParentFn = CI->getParent()->getParent();
3722 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
3723 return false;
3724 return true;
3725 }
3726
namespace {
// Indices of the special arguments of a chain call. They trail the SGPR and
// VGPR arguments, which sit at index 0 and 1 respectively.
enum ChainCallArgIdx {
  Exec = 2,
  Flags = 3,
  NumVGPRs = 4,
  FallbackExec = 5,
  FallbackCallee = 6
};
} // anonymous namespace
3739
3740 // The wave scratch offset register is used as the global base pointer.
LowerCall(CallLoweringInfo & CLI,SmallVectorImpl<SDValue> & InVals) const3741 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3742 SmallVectorImpl<SDValue> &InVals) const {
3743 CallingConv::ID CallConv = CLI.CallConv;
3744 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3745
3746 SelectionDAG &DAG = CLI.DAG;
3747
3748 const SDLoc &DL = CLI.DL;
3749 SDValue Chain = CLI.Chain;
3750 SDValue Callee = CLI.Callee;
3751
3752 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
3753 bool UsesDynamicVGPRs = false;
3754 if (IsChainCallConv) {
3755 // The last arguments should be the value that we need to put in EXEC,
3756 // followed by the flags and any other arguments with special meanings.
3757 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
3758 // we don't treat them like the "real" arguments.
3759 auto RequestedExecIt =
3760 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
3761 return Arg.OrigArgIndex == 2;
3762 });
3763 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
3764
3765 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
3766 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
3767 CLI.OutVals.end());
3768 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
3769
3770 assert(CLI.Outs.back().OrigArgIndex < 2 &&
3771 "Haven't popped all the special args");
3772
3773 TargetLowering::ArgListEntry RequestedExecArg =
3774 CLI.Args[ChainCallArgIdx::Exec];
3775 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3776 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3777
3778 // Convert constants into TargetConstants, so they become immediate operands
3779 // instead of being selected into S_MOV.
3780 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
3781 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
3782 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
3783 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
3784 } else
3785 ChainCallSpecialArgs.push_back(Arg.Node);
3786 };
3787
3788 PushNodeOrTargetConstant(RequestedExecArg);
3789
3790 // Process any other special arguments depending on the value of the flags.
3791 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
3792
3793 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
3794 if (FlagsValue.isZero()) {
3795 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
3796 return lowerUnhandledCall(CLI, InVals,
3797 "no additional args allowed if flags == 0");
3798 } else if (FlagsValue.isOneBitSet(0)) {
3799 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
3800 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
3801 }
3802
3803 if (!Subtarget->isWave32()) {
3804 return lowerUnhandledCall(
3805 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
3806 }
3807
3808 UsesDynamicVGPRs = true;
3809 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
3810 CLI.Args.end(), PushNodeOrTargetConstant);
3811 }
3812 }
3813
3814 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3815 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3816 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3817 bool &IsTailCall = CLI.IsTailCall;
3818 bool IsVarArg = CLI.IsVarArg;
3819 bool IsSibCall = false;
3820 MachineFunction &MF = DAG.getMachineFunction();
3821
3822 if (Callee.isUndef() || isNullConstant(Callee)) {
3823 if (!CLI.IsTailCall) {
3824 for (ISD::InputArg &Arg : CLI.Ins)
3825 InVals.push_back(DAG.getPOISON(Arg.VT));
3826 }
3827
3828 return Chain;
3829 }
3830
3831 if (IsVarArg) {
3832 return lowerUnhandledCall(CLI, InVals,
3833 "unsupported call to variadic function ");
3834 }
3835
3836 if (!CLI.CB)
3837 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
3838
3839 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3840 return lowerUnhandledCall(CLI, InVals,
3841 "unsupported required tail call to function ");
3842 }
3843
3844 if (IsTailCall) {
3845 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
3846 Outs, OutVals, Ins, DAG);
3847 if (!IsTailCall &&
3848 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3849 report_fatal_error("failed to perform tail call elimination on a call "
3850 "site marked musttail or on llvm.amdgcn.cs.chain");
3851 }
3852
3853 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3854
3855 // A sibling call is one where we're under the usual C ABI and not planning
3856 // to change that but can still do a tail call:
3857 if (!TailCallOpt && IsTailCall)
3858 IsSibCall = true;
3859
3860 if (IsTailCall)
3861 ++NumTailCalls;
3862 }
3863
3864 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3865 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3866 SmallVector<SDValue, 8> MemOpChains;
3867
3868 // Analyze operands of the call, assigning locations to each operand.
3869 SmallVector<CCValAssign, 16> ArgLocs;
3870 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3871 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3872
3873 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3874 // With a fixed ABI, allocate fixed registers before user arguments.
3875 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3876 }
3877
3878 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3879
3880 // Get a count of how many bytes are to be pushed on the stack.
3881 unsigned NumBytes = CCInfo.getStackSize();
3882
3883 if (IsSibCall) {
3884 // Since we're not changing the ABI to make this a tail call, the memory
3885 // operands are already available in the caller's incoming argument space.
3886 NumBytes = 0;
3887 }
3888
3889 // FPDiff is the byte offset of the call's argument area from the callee's.
3890 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3891 // by this amount for a tail call. In a sibling call it must be 0 because the
3892 // caller will deallocate the entire stack and the callee still expects its
3893 // arguments to begin at SP+0. Completely unused for non-tail calls.
3894 int32_t FPDiff = 0;
3895 MachineFrameInfo &MFI = MF.getFrameInfo();
3896 auto *TRI = Subtarget->getRegisterInfo();
3897
3898 // Adjust the stack pointer for the new arguments...
3899 // These operations are automatically eliminated by the prolog/epilog pass
3900 if (!IsSibCall)
3901 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3902
3903 if (!IsSibCall || IsChainCallConv) {
3904 if (!Subtarget->enableFlatScratch()) {
3905 SmallVector<SDValue, 4> CopyFromChains;
3906
3907 // In the HSA case, this should be an identity copy.
3908 SDValue ScratchRSrcReg =
3909 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3910 RegsToPass.emplace_back(IsChainCallConv
3911 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3912 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3913 ScratchRSrcReg);
3914 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3915 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3916 }
3917 }
3918
3919 const unsigned NumSpecialInputs = RegsToPass.size();
3920
3921 MVT PtrVT = MVT::i32;
3922
3923 // Walk the register/memloc assignments, inserting copies/loads.
3924 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3925 CCValAssign &VA = ArgLocs[i];
3926 SDValue Arg = OutVals[i];
3927
3928 // Promote the value if needed.
3929 switch (VA.getLocInfo()) {
3930 case CCValAssign::Full:
3931 break;
3932 case CCValAssign::BCvt:
3933 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3934 break;
3935 case CCValAssign::ZExt:
3936 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3937 break;
3938 case CCValAssign::SExt:
3939 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3940 break;
3941 case CCValAssign::AExt:
3942 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3943 break;
3944 case CCValAssign::FPExt:
3945 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3946 break;
3947 default:
3948 llvm_unreachable("Unknown loc info!");
3949 }
3950
3951 if (VA.isRegLoc()) {
3952 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3953 } else {
3954 assert(VA.isMemLoc());
3955
3956 SDValue DstAddr;
3957 MachinePointerInfo DstInfo;
3958
3959 unsigned LocMemOffset = VA.getLocMemOffset();
3960 int32_t Offset = LocMemOffset;
3961
3962 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3963 MaybeAlign Alignment;
3964
3965 if (IsTailCall) {
3966 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3967 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
3968 : VA.getValVT().getStoreSize();
3969
3970 // FIXME: We can have better than the minimum byval required alignment.
3971 Alignment =
3972 Flags.isByVal()
3973 ? Flags.getNonZeroByValAlign()
3974 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3975
3976 Offset = Offset + FPDiff;
3977 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3978
3979 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3980 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3981
3982 // Make sure any stack arguments overlapping with where we're storing
3983 // are loaded before this eventual operation. Otherwise they'll be
3984 // clobbered.
3985
3986 // FIXME: Why is this really necessary? This seems to just result in a
3987 // lot of code to copy the stack and write them back to the same
3988 // locations, which are supposed to be immutable?
3989 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3990 } else {
3991 // Stores to the argument stack area are relative to the stack pointer.
3992 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3993 MVT::i32);
3994 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3995 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3996 Alignment =
3997 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3998 }
3999
4000 if (Outs[i].Flags.isByVal()) {
4001 SDValue SizeNode =
4002 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4003 SDValue Cpy =
4004 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4005 Outs[i].Flags.getNonZeroByValAlign(),
4006 /*isVol = */ false, /*AlwaysInline = */ true,
4007 /*CI=*/nullptr, std::nullopt, DstInfo,
4008 MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
4009
4010 MemOpChains.push_back(Cpy);
4011 } else {
4012 SDValue Store =
4013 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4014 MemOpChains.push_back(Store);
4015 }
4016 }
4017 }
4018
4019 if (!MemOpChains.empty())
4020 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4021
4022 SDValue ReadFirstLaneID =
4023 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4024
4025 SDValue TokenGlue;
4026 if (CLI.ConvergenceControlToken) {
4027 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4028 CLI.ConvergenceControlToken);
4029 }
4030
4031 // Build a sequence of copy-to-reg nodes chained together with token chain
4032 // and flag operands which copy the outgoing args into the appropriate regs.
4033 SDValue InGlue;
4034
4035 unsigned ArgIdx = 0;
4036 for (auto [Reg, Val] : RegsToPass) {
4037 if (ArgIdx++ >= NumSpecialInputs &&
4038 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
      // For chain calls, the inreg arguments are required to be
      // uniform. Speculatively insert a readfirstlane in case we cannot prove
      // they are uniform.
      //
      // For other calls, if an inreg argument is known to be uniform,
      // speculatively insert a readfirstlane in case it is in a VGPR.
4045 //
4046 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4047 // value, so let that continue to produce invalid code.
4048
4049 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4050 if (TokenGlue)
4051 ReadfirstlaneArgs.push_back(TokenGlue);
4052 Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
4053 ReadfirstlaneArgs);
4054 }
4055
4056 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4057 InGlue = Chain.getValue(1);
4058 }
4059
4060 // We don't usually want to end the call-sequence here because we would tidy
4061 // the frame up *after* the call, however in the ABI-changing tail-call case
4062 // we've carefully laid out the parameters so that when sp is reset they'll be
4063 // in the correct location.
4064 if (IsTailCall && !IsSibCall) {
4065 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4066 InGlue = Chain.getValue(1);
4067 }
4068
4069 std::vector<SDValue> Ops({Chain});
4070
4071 // Add a redundant copy of the callee global which will not be legalized, as
4072 // we need direct access to the callee later.
4073 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
4074 const GlobalValue *GV = GSD->getGlobal();
4075 Ops.push_back(Callee);
4076 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4077 } else {
4078 if (IsTailCall) {
4079 // isEligibleForTailCallOptimization considered whether the call target is
4080 // divergent, but we may still end up with a uniform value in a VGPR.
4081 // Insert a readfirstlane just in case.
4082 SDValue ReadFirstLaneID =
4083 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4084
4085 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4086 if (TokenGlue)
4087 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4088 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4089 ReadfirstlaneArgs);
4090 }
4091
4092 Ops.push_back(Callee);
4093 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4094 }
4095
4096 if (IsTailCall) {
4097 // Each tail call may have to adjust the stack by a different amount, so
4098 // this information must travel along with the operation for eventual
4099 // consumption by emitEpilogue.
4100 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4101 }
4102
4103 if (IsChainCallConv)
4104 llvm::append_range(Ops, ChainCallSpecialArgs);
4105
4106 // Add argument registers to the end of the list so that they are known live
4107 // into the call.
4108 for (auto &[Reg, Val] : RegsToPass)
4109 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4110
4111 // Add a register mask operand representing the call-preserved registers.
4112 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4113 assert(Mask && "Missing call preserved mask for calling convention");
4114 Ops.push_back(DAG.getRegisterMask(Mask));
4115
4116 if (SDValue Token = CLI.ConvergenceControlToken) {
4117 SmallVector<SDValue, 2> GlueOps;
4118 GlueOps.push_back(Token);
4119 if (InGlue)
4120 GlueOps.push_back(InGlue);
4121
4122 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4123 MVT::Glue, GlueOps),
4124 0);
4125 }
4126
4127 if (InGlue)
4128 Ops.push_back(InGlue);
4129
  // If we're doing a tail call, use a TC_RETURN here rather than an
4131 // actual call instruction.
4132 if (IsTailCall) {
4133 MFI.setHasTailCall();
4134 unsigned OPC = AMDGPUISD::TC_RETURN;
4135 switch (CallConv) {
4136 case CallingConv::AMDGPU_Gfx:
4137 OPC = AMDGPUISD::TC_RETURN_GFX;
4138 break;
4139 case CallingConv::AMDGPU_CS_Chain:
4140 case CallingConv::AMDGPU_CS_ChainPreserve:
4141 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4142 : AMDGPUISD::TC_RETURN_CHAIN;
4143 break;
4144 }
4145
4146 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4147 }
4148
4149 // Returns a chain and a flag for retval copy to use.
4150 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4151 Chain = Call.getValue(0);
4152 InGlue = Call.getValue(1);
4153
4154 uint64_t CalleePopBytes = NumBytes;
4155 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4156 if (!Ins.empty())
4157 InGlue = Chain.getValue(1);
4158
4159 // Handle result values, copying them out of physregs into vregs that we
4160 // return.
4161 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4162 InVals, /*IsThisReturn=*/false, SDValue());
4163 }
4164
4165 // This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4166 // except for:
4167 // 1. Stack growth direction(default: downwards, AMDGPU: upwards), and
4168 // 2. Scale size where, scale = wave-reduction(alloca-size) * wave-size
LowerDYNAMIC_STACKALLOC(SDValue Op,SelectionDAG & DAG) const4169 SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
4170 SelectionDAG &DAG) const {
4171 const MachineFunction &MF = DAG.getMachineFunction();
4172 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4173
4174 SDLoc dl(Op);
4175 EVT VT = Op.getValueType();
4176 SDValue Chain = Op.getOperand(0);
4177 Register SPReg = Info->getStackPtrOffsetReg();
4178
4179 // Chain the dynamic stack allocation so that it doesn't modify the stack
4180 // pointer when other instructions are using the stack.
4181 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4182
4183 SDValue Size = Op.getOperand(1);
4184 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4185 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4186
4187 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4188 assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4189 "Stack grows upwards for AMDGPU");
4190
4191 Chain = BaseAddr.getValue(1);
4192 Align StackAlign = TFL->getStackAlign();
4193 if (Alignment > StackAlign) {
4194 uint64_t ScaledAlignment = (uint64_t)Alignment.value()
4195 << Subtarget->getWavefrontSizeLog2();
4196 uint64_t StackAlignMask = ScaledAlignment - 1;
4197 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4198 DAG.getConstant(StackAlignMask, dl, VT));
4199 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4200 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4201 }
4202
4203 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4204 SDValue NewSP;
4205 if (isa<ConstantSDNode>(Size)) {
4206 // For constant sized alloca, scale alloca size by wave-size
4207 SDValue ScaledSize = DAG.getNode(
4208 ISD::SHL, dl, VT, Size,
4209 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4210 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4211 } else {
4212 // For dynamic sized alloca, perform wave-wide reduction to get max of
4213 // alloca size(divergent) and then scale it by wave-size
4214 SDValue WaveReduction =
4215 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4216 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4217 Size, DAG.getConstant(0, dl, MVT::i32));
4218 SDValue ScaledSize = DAG.getNode(
4219 ISD::SHL, dl, VT, Size,
4220 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4221 NewSP =
4222 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4223 SDValue ReadFirstLaneID =
4224 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4225 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4226 NewSP);
4227 }
4228
4229 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4230 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4231
4232 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4233 }
4234
LowerSTACKSAVE(SDValue Op,SelectionDAG & DAG) const4235 SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4236 if (Op.getValueType() != MVT::i32)
4237 return Op; // Defer to cannot select error.
4238
4239 Register SP = getStackPointerRegisterToSaveRestore();
4240 SDLoc SL(Op);
4241
4242 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4243
4244 // Convert from wave uniform to swizzled vector address. This should protect
4245 // from any edge cases where the stacksave result isn't directly used with
4246 // stackrestore.
4247 SDValue VectorAddress =
4248 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4249 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4250 }
4251
lowerGET_ROUNDING(SDValue Op,SelectionDAG & DAG) const4252 SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4253 SelectionDAG &DAG) const {
4254 SDLoc SL(Op);
4255 assert(Op.getValueType() == MVT::i32);
4256
4257 uint32_t BothRoundHwReg =
4258 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4259 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4260
4261 SDValue IntrinID =
4262 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4263 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4264 Op.getOperand(0), IntrinID, GetRoundBothImm);
4265
4266 // There are two rounding modes, one for f32 and one for f64/f16. We only
4267 // report in the standard value range if both are the same.
4268 //
4269 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4270 // ties away from zero is not supported, and the other values are rotated by
4271 // 1.
4272 //
4273 // If the two rounding modes are not the same, report a target defined value.
4274
4275 // Mode register rounding mode fields:
4276 //
4277 // [1:0] Single-precision round mode.
4278 // [3:2] Double/Half-precision round mode.
4279 //
4280 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4281 //
4282 // Hardware Spec
4283 // Toward-0 3 0
4284 // Nearest Even 0 1
4285 // +Inf 1 2
4286 // -Inf 2 3
4287 // NearestAway0 N/A 4
4288 //
4289 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4290 // table we can index by the raw hardware mode.
4291 //
4292 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4293
4294 SDValue BitTable =
4295 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4296
4297 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4298 SDValue RoundModeTimesNumBits =
4299 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4300
4301 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4302 // knew only one mode was demanded.
4303 SDValue TableValue =
4304 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4305 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4306
4307 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4308 SDValue TableEntry =
4309 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4310
4311 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4312 // if it's an extended value.
4313 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4314 SDValue IsStandardValue =
4315 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4316 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4317 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4318 TableEntry, EnumOffset);
4319
4320 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4321 }
4322
lowerSET_ROUNDING(SDValue Op,SelectionDAG & DAG) const4323 SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4324 SelectionDAG &DAG) const {
4325 SDLoc SL(Op);
4326
4327 SDValue NewMode = Op.getOperand(1);
4328 assert(NewMode.getValueType() == MVT::i32);
4329
4330 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4331 // hardware MODE.fp_round values.
4332 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4333 uint32_t ClampedVal = std::min(
4334 static_cast<uint32_t>(ConstMode->getZExtValue()),
4335 static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
4336 NewMode = DAG.getConstant(
4337 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4338 } else {
4339 // If we know the input can only be one of the supported standard modes in
4340 // the range 0-3, we can use a simplified mapping to hardware values.
4341 KnownBits KB = DAG.computeKnownBits(NewMode);
4342 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4343 // The supported standard values are 0-3. The extended values start at 8. We
4344 // need to offset by 4 if the value is in the extended range.
4345
4346 if (UseReducedTable) {
4347 // Truncate to the low 32-bits.
4348 SDValue BitTable = DAG.getConstant(
4349 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4350
4351 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4352 SDValue RoundModeTimesNumBits =
4353 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4354
4355 NewMode =
4356 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4357
4358 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4359 // the table extracted bits into inline immediates.
4360 } else {
4361 // table_index = umin(value, value - 4)
4362 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4363 SDValue BitTable =
4364 DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4365
4366 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4367 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4368 SDValue IndexVal =
4369 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4370
4371 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4372 SDValue RoundModeTimesNumBits =
4373 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4374
4375 SDValue TableValue =
4376 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4377 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4378
4379 // No need to mask out the high bits since the setreg will ignore them
4380 // anyway.
4381 NewMode = TruncTable;
4382 }
4383
4384 // Insert a readfirstlane in case the value is a VGPR. We could do this
4385 // earlier and keep more operations scalar, but that interferes with
4386 // combining the source.
4387 SDValue ReadFirstLaneID =
4388 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4389 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4390 ReadFirstLaneID, NewMode);
4391 }
4392
4393 // N.B. The setreg will be later folded into s_round_mode on supported
4394 // targets.
4395 SDValue IntrinID =
4396 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4397 uint32_t BothRoundHwReg =
4398 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4399 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4400
4401 SDValue SetReg =
4402 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4403 IntrinID, RoundBothImm, NewMode);
4404
4405 return SetReg;
4406 }
4407
lowerPREFETCH(SDValue Op,SelectionDAG & DAG) const4408 SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4409 if (Op->isDivergent())
4410 return SDValue();
4411
4412 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4413 case AMDGPUAS::FLAT_ADDRESS:
4414 case AMDGPUAS::GLOBAL_ADDRESS:
4415 case AMDGPUAS::CONSTANT_ADDRESS:
4416 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4417 break;
4418 default:
4419 return SDValue();
4420 }
4421
4422 return Op;
4423 }
4424
4425 // Work around DAG legality rules only based on the result type.
lowerFP_EXTEND(SDValue Op,SelectionDAG & DAG) const4426 SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4427 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4428 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4429 EVT SrcVT = Src.getValueType();
4430
4431 if (SrcVT.getScalarType() != MVT::bf16)
4432 return Op;
4433
4434 SDLoc SL(Op);
4435 SDValue BitCast =
4436 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4437
4438 EVT DstVT = Op.getValueType();
4439 if (IsStrict)
4440 llvm_unreachable("Need STRICT_BF16_TO_FP");
4441
4442 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4443 }
4444
lowerGET_FPENV(SDValue Op,SelectionDAG & DAG) const4445 SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4446 SDLoc SL(Op);
4447 if (Op.getValueType() != MVT::i64)
4448 return Op;
4449
4450 uint32_t ModeHwReg =
4451 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
4452 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4453 uint32_t TrapHwReg =
4454 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
4455 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4456
4457 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4458 SDValue IntrinID =
4459 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4460 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4461 Op.getOperand(0), IntrinID, ModeHwRegImm);
4462 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4463 Op.getOperand(0), IntrinID, TrapHwRegImm);
4464 SDValue TokenReg =
4465 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4466 GetTrapReg.getValue(1));
4467
4468 SDValue CvtPtr =
4469 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4470 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4471
4472 return DAG.getMergeValues({Result, TokenReg}, SL);
4473 }
4474
lowerSET_FPENV(SDValue Op,SelectionDAG & DAG) const4475 SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4476 SDLoc SL(Op);
4477 if (Op.getOperand(1).getValueType() != MVT::i64)
4478 return Op;
4479
4480 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4481 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4482 DAG.getConstant(0, SL, MVT::i32));
4483 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4484 DAG.getConstant(1, SL, MVT::i32));
4485
4486 SDValue ReadFirstLaneID =
4487 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4488 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4489 ReadFirstLaneID, NewModeReg);
4490 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4491 ReadFirstLaneID, NewTrapReg);
4492
4493 unsigned ModeHwReg =
4494 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
4495 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4496 unsigned TrapHwReg =
4497 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
4498 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4499
4500 SDValue IntrinID =
4501 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4502 SDValue SetModeReg =
4503 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4504 IntrinID, ModeHwRegImm, NewModeReg);
4505 SDValue SetTrapReg =
4506 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4507 IntrinID, TrapHwRegImm, NewTrapReg);
4508 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4509 }
4510
getRegisterByName(const char * RegName,LLT VT,const MachineFunction & MF) const4511 Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4512 const MachineFunction &MF) const {
4513 const Function &Fn = MF.getFunction();
4514
4515 Register Reg = StringSwitch<Register>(RegName)
4516 .Case("m0", AMDGPU::M0)
4517 .Case("exec", AMDGPU::EXEC)
4518 .Case("exec_lo", AMDGPU::EXEC_LO)
4519 .Case("exec_hi", AMDGPU::EXEC_HI)
4520 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4521 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4522 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4523 .Default(Register());
4524 if (!Reg)
4525 return Reg;
4526
4527 if (!Subtarget->hasFlatScrRegister() &&
4528 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4529 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4530 "\" for subtarget."));
4531 }
4532
4533 switch (Reg) {
4534 case AMDGPU::M0:
4535 case AMDGPU::EXEC_LO:
4536 case AMDGPU::EXEC_HI:
4537 case AMDGPU::FLAT_SCR_LO:
4538 case AMDGPU::FLAT_SCR_HI:
4539 if (VT.getSizeInBits() == 32)
4540 return Reg;
4541 break;
4542 case AMDGPU::EXEC:
4543 case AMDGPU::FLAT_SCR:
4544 if (VT.getSizeInBits() == 64)
4545 return Reg;
4546 break;
4547 default:
4548 llvm_unreachable("missing register type checking");
4549 }
4550
4551 report_fatal_error(
4552 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4553 }
4554
4555 // If kill is not the last instruction, split the block so kill is always a
4556 // proper terminator.
4557 MachineBasicBlock *
splitKillBlock(MachineInstr & MI,MachineBasicBlock * BB) const4558 SITargetLowering::splitKillBlock(MachineInstr &MI,
4559 MachineBasicBlock *BB) const {
4560 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4561 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4562 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4563 return SplitBB;
4564 }
4565
4566 // Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
4567 // \p MI will be the only instruction in the loop body block. Otherwise, it will
4568 // be the first instruction in the remainder block.
4569 //
4570 /// \returns { LoopBody, Remainder }
4571 static std::pair<MachineBasicBlock *, MachineBasicBlock *>
splitBlockForLoop(MachineInstr & MI,MachineBasicBlock & MBB,bool InstInLoop)4572 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
4573 MachineFunction *MF = MBB.getParent();
4574 MachineBasicBlock::iterator I(&MI);
4575
4576 // To insert the loop we need to split the block. Move everything after this
4577 // point to a new block, and insert a new empty block between the two.
4578 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4579 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4580 MachineFunction::iterator MBBI(MBB);
4581 ++MBBI;
4582
4583 MF->insert(MBBI, LoopBB);
4584 MF->insert(MBBI, RemainderBB);
4585
4586 LoopBB->addSuccessor(LoopBB);
4587 LoopBB->addSuccessor(RemainderBB);
4588
4589 // Move the rest of the block into a new block.
4590 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4591
4592 if (InstInLoop) {
4593 auto Next = std::next(I);
4594
4595 // Move instruction to loop body.
4596 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4597
4598 // Move the rest of the block.
4599 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4600 } else {
4601 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4602 }
4603
4604 MBB.addSuccessor(LoopBB);
4605
4606 return std::pair(LoopBB, RemainderBB);
4607 }
4608
4609 /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
bundleInstWithWaitcnt(MachineInstr & MI) const4610 void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
4611 MachineBasicBlock *MBB = MI.getParent();
4612 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4613 auto I = MI.getIterator();
4614 auto E = std::next(I);
4615
4616 // clang-format off
4617 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4618 .addImm(0);
4619 // clang-format on
4620
4621 MIBundleBuilder Bundler(*MBB, I, E);
4622 finalizeBundle(*MBB, Bundler.begin());
4623 }
4624
4625 MachineBasicBlock *
emitGWSMemViolTestLoop(MachineInstr & MI,MachineBasicBlock * BB) const4626 SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
4627 MachineBasicBlock *BB) const {
4628 const DebugLoc &DL = MI.getDebugLoc();
4629
4630 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4631
4632 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4633
4634 // Apparently kill flags are only valid if the def is in the same block?
4635 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4636 Src->setIsKill(false);
4637
4638 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4639
4640 MachineBasicBlock::iterator I = LoopBB->end();
4641
4642 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4643 AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
4644
4645 // Clear TRAP_STS.MEM_VIOL
4646 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4647 .addImm(0)
4648 .addImm(EncodedReg);
4649
4650 bundleInstWithWaitcnt(MI);
4651
4652 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4653
4654 // Load and check TRAP_STS.MEM_VIOL
4655 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4656 .addImm(EncodedReg);
4657
4658 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4659 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4660 .addReg(Reg, RegState::Kill)
4661 .addImm(0);
4662 // clang-format off
4663 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4664 .addMBB(LoopBB);
4665 // clang-format on
4666
4667 return RemainderBB;
4668 }
4669
4670 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4671 // wavefront. If the value is uniform and just happens to be in a VGPR, this
4672 // will only do one iteration. In the worst case, this will loop 64 times.
4673 //
4674 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
4675 static MachineBasicBlock::iterator
emitLoadM0FromVGPRLoop(const SIInstrInfo * TII,MachineRegisterInfo & MRI,MachineBasicBlock & OrigBB,MachineBasicBlock & LoopBB,const DebugLoc & DL,const MachineOperand & Idx,unsigned InitReg,unsigned ResultReg,unsigned PhiReg,unsigned InitSaveExecReg,int Offset,bool UseGPRIdxMode,Register & SGPRIdxReg)4676 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
4677 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4678 const DebugLoc &DL, const MachineOperand &Idx,
4679 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4680 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4681 Register &SGPRIdxReg) {
4682
4683 MachineFunction *MF = OrigBB.getParent();
4684 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4685 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4686 MachineBasicBlock::iterator I = LoopBB.begin();
4687
4688 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4689 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4690 Register NewExec = MRI.createVirtualRegister(BoolRC);
4691 Register CurrentIdxReg =
4692 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4693 Register CondReg = MRI.createVirtualRegister(BoolRC);
4694
4695 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4696 .addReg(InitReg)
4697 .addMBB(&OrigBB)
4698 .addReg(ResultReg)
4699 .addMBB(&LoopBB);
4700
4701 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4702 .addReg(InitSaveExecReg)
4703 .addMBB(&OrigBB)
4704 .addReg(NewExec)
4705 .addMBB(&LoopBB);
4706
4707 // Read the next variant <- also loop target.
4708 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4709 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4710
4711 // Compare the just read M0 value to all possible Idx values.
4712 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4713 .addReg(CurrentIdxReg)
4714 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4715
4716 // Update EXEC, save the original EXEC value to VCC.
4717 BuildMI(LoopBB, I, DL,
4718 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4719 : AMDGPU::S_AND_SAVEEXEC_B64),
4720 NewExec)
4721 .addReg(CondReg, RegState::Kill);
4722
4723 MRI.setSimpleHint(NewExec, CondReg);
4724
4725 if (UseGPRIdxMode) {
4726 if (Offset == 0) {
4727 SGPRIdxReg = CurrentIdxReg;
4728 } else {
4729 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4730 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4731 .addReg(CurrentIdxReg, RegState::Kill)
4732 .addImm(Offset);
4733 }
4734 } else {
4735 // Move index from VCC into M0
4736 if (Offset == 0) {
4737 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
4738 .addReg(CurrentIdxReg, RegState::Kill);
4739 } else {
4740 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4741 .addReg(CurrentIdxReg, RegState::Kill)
4742 .addImm(Offset);
4743 }
4744 }
4745
4746 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4747 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4748 MachineInstr *InsertPt =
4749 BuildMI(LoopBB, I, DL,
4750 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4751 : AMDGPU::S_XOR_B64_term),
4752 Exec)
4753 .addReg(Exec)
4754 .addReg(NewExec);
4755
4756 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4757 // s_cbranch_scc0?
4758
4759 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4760 // clang-format off
4761 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4762 .addMBB(&LoopBB);
4763 // clang-format on
4764
4765 return InsertPt->getIterator();
4766 }
4767
4768 // This has slightly sub-optimal regalloc when the source vector is killed by
4769 // the read. The register allocator does not understand that the kill is
4770 // per-workitem, so is kept alive for the whole loop so we end up not re-using a
4771 // subregister from it, using 1 more VGPR than necessary. This was saved when
4772 // this was expanded after register allocation.
4773 static MachineBasicBlock::iterator
loadM0FromVGPR(const SIInstrInfo * TII,MachineBasicBlock & MBB,MachineInstr & MI,unsigned InitResultReg,unsigned PhiReg,int Offset,bool UseGPRIdxMode,Register & SGPRIdxReg)4774 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
4775 unsigned InitResultReg, unsigned PhiReg, int Offset,
4776 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4777 MachineFunction *MF = MBB.getParent();
4778 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4779 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4780 MachineRegisterInfo &MRI = MF->getRegInfo();
4781 const DebugLoc &DL = MI.getDebugLoc();
4782 MachineBasicBlock::iterator I(&MI);
4783
4784 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4785 Register DstReg = MI.getOperand(0).getReg();
4786 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4787 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4788 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4789 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4790
4791 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4792
4793 // Save the EXEC mask
4794 // clang-format off
4795 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4796 .addReg(Exec);
4797 // clang-format on
4798
4799 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
4800
4801 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4802
4803 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4804 InitResultReg, DstReg, PhiReg, TmpExec,
4805 Offset, UseGPRIdxMode, SGPRIdxReg);
4806
4807 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
4808 MachineFunction::iterator MBBI(LoopBB);
4809 ++MBBI;
4810 MF->insert(MBBI, LandingPad);
4811 LoopBB->removeSuccessor(RemainderBB);
4812 LandingPad->addSuccessor(RemainderBB);
4813 LoopBB->addSuccessor(LandingPad);
4814 MachineBasicBlock::iterator First = LandingPad->begin();
4815 // clang-format off
4816 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4817 .addReg(SaveExec);
4818 // clang-format on
4819
4820 return InsPt;
4821 }
4822
4823 // Returns subreg index, offset
4824 static std::pair<unsigned, int>
computeIndirectRegAndOffset(const SIRegisterInfo & TRI,const TargetRegisterClass * SuperRC,unsigned VecReg,int Offset)4825 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
4826 const TargetRegisterClass *SuperRC, unsigned VecReg,
4827 int Offset) {
4828 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4829
4830 // Skip out of bounds offsets, or else we would end up using an undefined
4831 // register.
4832 if (Offset >= NumElts || Offset < 0)
4833 return std::pair(AMDGPU::sub0, Offset);
4834
4835 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4836 }
4837
setM0ToIndexFromSGPR(const SIInstrInfo * TII,MachineRegisterInfo & MRI,MachineInstr & MI,int Offset)4838 static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
4839 MachineRegisterInfo &MRI, MachineInstr &MI,
4840 int Offset) {
4841 MachineBasicBlock *MBB = MI.getParent();
4842 const DebugLoc &DL = MI.getDebugLoc();
4843 MachineBasicBlock::iterator I(&MI);
4844
4845 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4846
4847 assert(Idx->getReg() != AMDGPU::NoRegister);
4848
4849 if (Offset == 0) {
4850 // clang-format off
4851 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
4852 .add(*Idx);
4853 // clang-format on
4854 } else {
4855 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4856 .add(*Idx)
4857 .addImm(Offset);
4858 }
4859 }
4860
getIndirectSGPRIdx(const SIInstrInfo * TII,MachineRegisterInfo & MRI,MachineInstr & MI,int Offset)4861 static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
4862 MachineRegisterInfo &MRI, MachineInstr &MI,
4863 int Offset) {
4864 MachineBasicBlock *MBB = MI.getParent();
4865 const DebugLoc &DL = MI.getDebugLoc();
4866 MachineBasicBlock::iterator I(&MI);
4867
4868 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4869
4870 if (Offset == 0)
4871 return Idx->getReg();
4872
4873 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4874 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4875 .add(*Idx)
4876 .addImm(Offset);
4877 return Tmp;
4878 }
4879
emitIndirectSrc(MachineInstr & MI,MachineBasicBlock & MBB,const GCNSubtarget & ST)4880 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
4881 MachineBasicBlock &MBB,
4882 const GCNSubtarget &ST) {
4883 const SIInstrInfo *TII = ST.getInstrInfo();
4884 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4885 MachineFunction *MF = MBB.getParent();
4886 MachineRegisterInfo &MRI = MF->getRegInfo();
4887
4888 Register Dst = MI.getOperand(0).getReg();
4889 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4890 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4891 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4892
4893 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4894 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4895
4896 unsigned SubReg;
4897 std::tie(SubReg, Offset) =
4898 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4899
4900 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4901
4902 // Check for a SGPR index.
4903 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4904 MachineBasicBlock::iterator I(&MI);
4905 const DebugLoc &DL = MI.getDebugLoc();
4906
4907 if (UseGPRIdxMode) {
4908 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4909 // to avoid interfering with other uses, so probably requires a new
4910 // optimization pass.
4911 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
4912
4913 const MCInstrDesc &GPRIDXDesc =
4914 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4915 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4916 .addReg(SrcReg)
4917 .addReg(Idx)
4918 .addImm(SubReg);
4919 } else {
4920 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
4921
4922 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4923 .addReg(SrcReg, 0, SubReg)
4924 .addReg(SrcReg, RegState::Implicit);
4925 }
4926
4927 MI.eraseFromParent();
4928
4929 return &MBB;
4930 }
4931
4932 // Control flow needs to be inserted if indexing with a VGPR.
4933 const DebugLoc &DL = MI.getDebugLoc();
4934 MachineBasicBlock::iterator I(&MI);
4935
4936 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4937 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4938
4939 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4940
4941 Register SGPRIdxReg;
4942 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4943 UseGPRIdxMode, SGPRIdxReg);
4944
4945 MachineBasicBlock *LoopBB = InsPt->getParent();
4946
4947 if (UseGPRIdxMode) {
4948 const MCInstrDesc &GPRIDXDesc =
4949 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4950
4951 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4952 .addReg(SrcReg)
4953 .addReg(SGPRIdxReg)
4954 .addImm(SubReg);
4955 } else {
4956 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4957 .addReg(SrcReg, 0, SubReg)
4958 .addReg(SrcReg, RegState::Implicit);
4959 }
4960
4961 MI.eraseFromParent();
4962
4963 return LoopBB;
4964 }
4965
emitIndirectDst(MachineInstr & MI,MachineBasicBlock & MBB,const GCNSubtarget & ST)4966 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
4967 MachineBasicBlock &MBB,
4968 const GCNSubtarget &ST) {
4969 const SIInstrInfo *TII = ST.getInstrInfo();
4970 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4971 MachineFunction *MF = MBB.getParent();
4972 MachineRegisterInfo &MRI = MF->getRegInfo();
4973
4974 Register Dst = MI.getOperand(0).getReg();
4975 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4976 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4977 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4978 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4979 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4980 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4981
4982 // This can be an immediate, but will be folded later.
4983 assert(Val->getReg());
4984
4985 unsigned SubReg;
4986 std::tie(SubReg, Offset) =
4987 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
4988 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4989
4990 if (Idx->getReg() == AMDGPU::NoRegister) {
4991 MachineBasicBlock::iterator I(&MI);
4992 const DebugLoc &DL = MI.getDebugLoc();
4993
4994 assert(Offset == 0);
4995
4996 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4997 .add(*SrcVec)
4998 .add(*Val)
4999 .addImm(SubReg);
5000
5001 MI.eraseFromParent();
5002 return &MBB;
5003 }
5004
5005 // Check for a SGPR index.
5006 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5007 MachineBasicBlock::iterator I(&MI);
5008 const DebugLoc &DL = MI.getDebugLoc();
5009
5010 if (UseGPRIdxMode) {
5011 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5012
5013 const MCInstrDesc &GPRIDXDesc =
5014 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5015 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5016 .addReg(SrcVec->getReg())
5017 .add(*Val)
5018 .addReg(Idx)
5019 .addImm(SubReg);
5020 } else {
5021 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5022
5023 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5024 TRI.getRegSizeInBits(*VecRC), 32, false);
5025 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5026 .addReg(SrcVec->getReg())
5027 .add(*Val)
5028 .addImm(SubReg);
5029 }
5030 MI.eraseFromParent();
5031 return &MBB;
5032 }
5033
5034 // Control flow needs to be inserted if indexing with a VGPR.
5035 if (Val->isReg())
5036 MRI.clearKillFlags(Val->getReg());
5037
5038 const DebugLoc &DL = MI.getDebugLoc();
5039
5040 Register PhiReg = MRI.createVirtualRegister(VecRC);
5041
5042 Register SGPRIdxReg;
5043 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5044 UseGPRIdxMode, SGPRIdxReg);
5045 MachineBasicBlock *LoopBB = InsPt->getParent();
5046
5047 if (UseGPRIdxMode) {
5048 const MCInstrDesc &GPRIDXDesc =
5049 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5050
5051 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5052 .addReg(PhiReg)
5053 .add(*Val)
5054 .addReg(SGPRIdxReg)
5055 .addImm(SubReg);
5056 } else {
5057 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5058 TRI.getRegSizeInBits(*VecRC), 32, false);
5059 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5060 .addReg(PhiReg)
5061 .add(*Val)
5062 .addImm(SubReg);
5063 }
5064
5065 MI.eraseFromParent();
5066 return LoopBB;
5067 }
5068
getIdentityValueForWaveReduction(unsigned Opc)5069 static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
5070 switch (Opc) {
5071 case AMDGPU::S_MIN_U32:
5072 return std::numeric_limits<uint32_t>::max();
5073 case AMDGPU::S_MIN_I32:
5074 return std::numeric_limits<int32_t>::max();
5075 case AMDGPU::S_MAX_U32:
5076 return std::numeric_limits<uint32_t>::min();
5077 case AMDGPU::S_MAX_I32:
5078 return std::numeric_limits<int32_t>::min();
5079 case AMDGPU::S_ADD_I32:
5080 case AMDGPU::S_SUB_I32:
5081 case AMDGPU::S_OR_B32:
5082 case AMDGPU::S_XOR_B32:
5083 return std::numeric_limits<uint32_t>::min();
5084 case AMDGPU::S_AND_B32:
5085 return std::numeric_limits<uint32_t>::max();
5086 default:
5087 llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
5088 }
5089 }
5090
lowerWaveReduce(MachineInstr & MI,MachineBasicBlock & BB,const GCNSubtarget & ST,unsigned Opc)5091 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5092 MachineBasicBlock &BB,
5093 const GCNSubtarget &ST,
5094 unsigned Opc) {
5095 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
5096 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5097 const DebugLoc &DL = MI.getDebugLoc();
5098 const SIInstrInfo *TII = ST.getInstrInfo();
5099
5100 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5101 Register SrcReg = MI.getOperand(1).getReg();
5102 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5103 Register DstReg = MI.getOperand(0).getReg();
5104 MachineBasicBlock *RetBB = nullptr;
5105 if (isSGPR) {
5106 switch (Opc) {
5107 case AMDGPU::S_MIN_U32:
5108 case AMDGPU::S_MIN_I32:
5109 case AMDGPU::S_MAX_U32:
5110 case AMDGPU::S_MAX_I32:
5111 case AMDGPU::S_AND_B32:
5112 case AMDGPU::S_OR_B32: {
5113 // Idempotent operations.
5114 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5115 RetBB = &BB;
5116 break;
5117 }
5118 case AMDGPU::S_XOR_B32:
5119 case AMDGPU::S_ADD_I32:
5120 case AMDGPU::S_SUB_I32: {
5121 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5122 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5123 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5124 Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
5125
5126 bool IsWave32 = ST.isWave32();
5127 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5128 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5129 unsigned CountReg =
5130 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5131
5132 auto Exec =
5133 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5134
5135 auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5136 .addReg(Exec->getOperand(0).getReg());
5137
5138 switch (Opc) {
5139 case AMDGPU::S_XOR_B32: {
5140 // Performing an XOR operation on a uniform value
5141 // depends on the parity of the number of active lanes.
5142 // For even parity, the result will be 0, for odd
5143 // parity the result will be the same as the input value.
5144 Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
5145
5146 auto ParityReg =
5147 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5148 .addReg(NewAccumulator->getOperand(0).getReg())
5149 .addImm(1);
5150 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5151 .addReg(SrcReg)
5152 .addReg(ParityReg->getOperand(0).getReg());
5153 break;
5154 }
5155 case AMDGPU::S_SUB_I32: {
5156 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5157
5158 // Take the negation of the source operand.
5159 auto InvertedValReg =
5160 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
5161 .addImm(-1)
5162 .addReg(SrcReg);
5163 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5164 .addReg(InvertedValReg->getOperand(0).getReg())
5165 .addReg(NewAccumulator->getOperand(0).getReg());
5166 break;
5167 }
5168 case AMDGPU::S_ADD_I32: {
5169 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5170 .addReg(SrcReg)
5171 .addReg(NewAccumulator->getOperand(0).getReg());
5172 break;
5173 }
5174 }
5175 RetBB = &BB;
5176 }
5177 }
5178 } else {
5179 // TODO: Implement DPP Strategy and switch based on immediate strategy
5180 // operand. For now, for all the cases (default, Iterative and DPP we use
5181 // iterative approach by default.)
5182
5183 // To reduce the VGPR using iterative approach, we need to iterate
5184 // over all the active lanes. Lowering consists of ComputeLoop,
5185 // which iterate over only active lanes. We use copy of EXEC register
5186 // as induction variable and every active lane modifies it using bitset0
5187 // so that we will get the next active lane for next iteration.
5188 MachineBasicBlock::iterator I = BB.end();
5189 Register SrcReg = MI.getOperand(1).getReg();
5190
5191 // Create Control flow for loop
5192 // Split MI's Machine Basic block into For loop
5193 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5194
5195 // Create virtual registers required for lowering.
5196 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5197 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5198 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5199 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
5200
5201 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5202 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5203 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5204
5205 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
5206 Register LaneValueReg =
5207 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5208
5209 bool IsWave32 = ST.isWave32();
5210 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5211 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5212
5213 // Create initial values of induction variable from Exec, Accumulator and
5214 // insert branch instr to newly created ComputeBlock
5215 uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
5216 auto TmpSReg =
5217 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
5218 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
5219 .addImm(InitalValue);
5220 // clang-format off
5221 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5222 .addMBB(ComputeLoop);
5223 // clang-format on
5224
5225 // Start constructing ComputeLoop
5226 I = ComputeLoop->end();
5227 auto Accumulator =
5228 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5229 .addReg(InitalValReg)
5230 .addMBB(&BB);
5231 auto ActiveBits =
5232 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5233 .addReg(TmpSReg->getOperand(0).getReg())
5234 .addMBB(&BB);
5235
5236 // Perform the computations
5237 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5238 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5239 .addReg(ActiveBits->getOperand(0).getReg());
5240 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5241 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5242 .addReg(SrcReg)
5243 .addReg(FF1->getOperand(0).getReg());
5244 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5245 .addReg(Accumulator->getOperand(0).getReg())
5246 .addReg(LaneValue->getOperand(0).getReg());
5247
5248 // Manipulate the iterator to get the next active lane
5249 unsigned BITSETOpc =
5250 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5251 auto NewActiveBits =
5252 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5253 .addReg(FF1->getOperand(0).getReg())
5254 .addReg(ActiveBits->getOperand(0).getReg());
5255
5256 // Add phi nodes
5257 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5258 .addMBB(ComputeLoop);
5259 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5260 .addMBB(ComputeLoop);
5261
5262 // Creating branching
5263 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5264 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5265 .addReg(NewActiveBits->getOperand(0).getReg())
5266 .addImm(0);
5267 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5268 .addMBB(ComputeLoop);
5269
5270 RetBB = ComputeEnd;
5271 }
5272 MI.eraseFromParent();
5273 return RetBB;
5274 }
5275
5276 MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr & MI,MachineBasicBlock * BB) const5277 SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
5278 MachineBasicBlock *BB) const {
5279
5280 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5281 MachineFunction *MF = BB->getParent();
5282 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
5283
5284 switch (MI.getOpcode()) {
5285 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5286 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5287 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5288 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5289 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5290 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5291 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5292 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5293 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5294 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5295 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5296 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5297 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5298 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5299 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5300 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5301 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5302 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5303 case AMDGPU::S_UADDO_PSEUDO:
5304 case AMDGPU::S_USUBO_PSEUDO: {
5305 const DebugLoc &DL = MI.getDebugLoc();
5306 MachineOperand &Dest0 = MI.getOperand(0);
5307 MachineOperand &Dest1 = MI.getOperand(1);
5308 MachineOperand &Src0 = MI.getOperand(2);
5309 MachineOperand &Src1 = MI.getOperand(3);
5310
5311 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5312 ? AMDGPU::S_ADD_I32
5313 : AMDGPU::S_SUB_I32;
5314 // clang-format off
5315 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5316 .add(Src0)
5317 .add(Src1);
5318 // clang-format on
5319
5320 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5321 .addImm(1)
5322 .addImm(0);
5323
5324 MI.eraseFromParent();
5325 return BB;
5326 }
5327 case AMDGPU::S_ADD_U64_PSEUDO:
5328 case AMDGPU::S_SUB_U64_PSEUDO: {
5329 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5330 // For GFX12, we emit s_add_u64 and s_sub_u64.
5331 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5332 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5333 const DebugLoc &DL = MI.getDebugLoc();
5334 MachineOperand &Dest = MI.getOperand(0);
5335 MachineOperand &Src0 = MI.getOperand(1);
5336 MachineOperand &Src1 = MI.getOperand(2);
5337 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5338 if (Subtarget->hasScalarAddSub64()) {
5339 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5340 // clang-format off
5341 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5342 .add(Src0)
5343 .add(Src1);
5344 // clang-format on
5345 } else {
5346 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5347 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5348
5349 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5350 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5351
5352 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5353 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5354 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5355 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5356
5357 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5358 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5359 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5360 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5361
5362 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5363 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5364 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5365 .add(Src0Sub0)
5366 .add(Src1Sub0);
5367 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5368 .add(Src0Sub1)
5369 .add(Src1Sub1);
5370 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5371 .addReg(DestSub0)
5372 .addImm(AMDGPU::sub0)
5373 .addReg(DestSub1)
5374 .addImm(AMDGPU::sub1);
5375 }
5376 MI.eraseFromParent();
5377 return BB;
5378 }
5379 case AMDGPU::V_ADD_U64_PSEUDO:
5380 case AMDGPU::V_SUB_U64_PSEUDO: {
5381 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5382 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5383 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5384 const DebugLoc &DL = MI.getDebugLoc();
5385
5386 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5387
5388 MachineOperand &Dest = MI.getOperand(0);
5389 MachineOperand &Src0 = MI.getOperand(1);
5390 MachineOperand &Src1 = MI.getOperand(2);
5391
5392 if (IsAdd && ST.hasLshlAddU64Inst()) {
5393 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5394 Dest.getReg())
5395 .add(Src0)
5396 .addImm(0)
5397 .add(Src1);
5398 TII->legalizeOperands(*Add);
5399 MI.eraseFromParent();
5400 return BB;
5401 }
5402
5403 const auto *CarryRC = TRI->getWaveMaskRegClass();
5404
5405 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5406 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5407
5408 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5409 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5410
5411 const TargetRegisterClass *Src0RC = Src0.isReg()
5412 ? MRI.getRegClass(Src0.getReg())
5413 : &AMDGPU::VReg_64RegClass;
5414 const TargetRegisterClass *Src1RC = Src1.isReg()
5415 ? MRI.getRegClass(Src1.getReg())
5416 : &AMDGPU::VReg_64RegClass;
5417
5418 const TargetRegisterClass *Src0SubRC =
5419 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5420 const TargetRegisterClass *Src1SubRC =
5421 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5422
5423 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5424 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5425 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5426 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5427
5428 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5429 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5430 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5431 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5432
5433 unsigned LoOpc =
5434 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5435 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5436 .addReg(CarryReg, RegState::Define)
5437 .add(SrcReg0Sub0)
5438 .add(SrcReg1Sub0)
5439 .addImm(0); // clamp bit
5440
5441 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5442 MachineInstr *HiHalf =
5443 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5444 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5445 .add(SrcReg0Sub1)
5446 .add(SrcReg1Sub1)
5447 .addReg(CarryReg, RegState::Kill)
5448 .addImm(0); // clamp bit
5449
5450 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5451 .addReg(DestSub0)
5452 .addImm(AMDGPU::sub0)
5453 .addReg(DestSub1)
5454 .addImm(AMDGPU::sub1);
5455 TII->legalizeOperands(*LoHalf);
5456 TII->legalizeOperands(*HiHalf);
5457 MI.eraseFromParent();
5458 return BB;
5459 }
5460 case AMDGPU::S_ADD_CO_PSEUDO:
5461 case AMDGPU::S_SUB_CO_PSEUDO: {
5462 // This pseudo has a chance to be selected
5463 // only from uniform add/subcarry node. All the VGPR operands
5464 // therefore assumed to be splat vectors.
5465 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5466 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5467 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5468 MachineBasicBlock::iterator MII = MI;
5469 const DebugLoc &DL = MI.getDebugLoc();
5470 MachineOperand &Dest = MI.getOperand(0);
5471 MachineOperand &CarryDest = MI.getOperand(1);
5472 MachineOperand &Src0 = MI.getOperand(2);
5473 MachineOperand &Src1 = MI.getOperand(3);
5474 MachineOperand &Src2 = MI.getOperand(4);
5475 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5476 ? AMDGPU::S_ADDC_U32
5477 : AMDGPU::S_SUBB_U32;
5478 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5479 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5480 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5481 .addReg(Src0.getReg());
5482 Src0.setReg(RegOp0);
5483 }
5484 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5485 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5486 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5487 .addReg(Src1.getReg());
5488 Src1.setReg(RegOp1);
5489 }
5490 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5491 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5492 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5493 .addReg(Src2.getReg());
5494 Src2.setReg(RegOp2);
5495 }
5496
5497 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5498 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5499 assert(WaveSize == 64 || WaveSize == 32);
5500
5501 if (WaveSize == 64) {
5502 if (ST.hasScalarCompareEq64()) {
5503 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5504 .addReg(Src2.getReg())
5505 .addImm(0);
5506 } else {
5507 const TargetRegisterClass *SubRC =
5508 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5509 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5510 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5511 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5512 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5513 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5514
5515 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5516 .add(Src2Sub0)
5517 .add(Src2Sub1);
5518
5519 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5520 .addReg(Src2_32, RegState::Kill)
5521 .addImm(0);
5522 }
5523 } else {
5524 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5525 .addReg(Src2.getReg())
5526 .addImm(0);
5527 }
5528
5529 // clang-format off
5530 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
5531 .add(Src0)
5532 .add(Src1);
5533 // clang-format on
5534
5535 unsigned SelOpc =
5536 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5537
5538 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5539 .addImm(-1)
5540 .addImm(0);
5541
5542 MI.eraseFromParent();
5543 return BB;
5544 }
5545 case AMDGPU::SI_INIT_M0: {
5546 MachineOperand &M0Init = MI.getOperand(0);
5547 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5548 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
5549 AMDGPU::M0)
5550 .add(M0Init);
5551 MI.eraseFromParent();
5552 return BB;
5553 }
5554 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
5555 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
5556 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5557 TII->get(AMDGPU::S_CMP_EQ_U32))
5558 .addImm(0)
5559 .addImm(0);
5560 return BB;
5561 }
5562 case AMDGPU::GET_GROUPSTATICSIZE: {
5563 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5564 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5565 DebugLoc DL = MI.getDebugLoc();
5566 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5567 .add(MI.getOperand(0))
5568 .addImm(MFI->getLDSSize());
5569 MI.eraseFromParent();
5570 return BB;
5571 }
5572 case AMDGPU::GET_SHADERCYCLESHILO: {
5573 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
5574 MachineRegisterInfo &MRI = MF->getRegInfo();
5575 const DebugLoc &DL = MI.getDebugLoc();
5576 // The algorithm is:
5577 //
5578 // hi1 = getreg(SHADER_CYCLES_HI)
5579 // lo1 = getreg(SHADER_CYCLES_LO)
5580 // hi2 = getreg(SHADER_CYCLES_HI)
5581 //
5582 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5583 // Otherwise there was overflow and the result is hi2:0. In both cases the
5584 // result should represent the actual time at some point during the sequence
5585 // of three getregs.
5586 using namespace AMDGPU::Hwreg;
5587 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5588 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5589 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5590 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5591 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5592 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5593 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5594 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5595 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5596 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5597 .addReg(RegHi1)
5598 .addReg(RegHi2);
5599 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5600 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5601 .addReg(RegLo1)
5602 .addImm(0);
5603 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5604 .add(MI.getOperand(0))
5605 .addReg(RegLo)
5606 .addImm(AMDGPU::sub0)
5607 .addReg(RegHi2)
5608 .addImm(AMDGPU::sub1);
5609 MI.eraseFromParent();
5610 return BB;
5611 }
5612 case AMDGPU::SI_INDIRECT_SRC_V1:
5613 case AMDGPU::SI_INDIRECT_SRC_V2:
5614 case AMDGPU::SI_INDIRECT_SRC_V4:
5615 case AMDGPU::SI_INDIRECT_SRC_V8:
5616 case AMDGPU::SI_INDIRECT_SRC_V9:
5617 case AMDGPU::SI_INDIRECT_SRC_V10:
5618 case AMDGPU::SI_INDIRECT_SRC_V11:
5619 case AMDGPU::SI_INDIRECT_SRC_V12:
5620 case AMDGPU::SI_INDIRECT_SRC_V16:
5621 case AMDGPU::SI_INDIRECT_SRC_V32:
5622 return emitIndirectSrc(MI, *BB, *getSubtarget());
5623 case AMDGPU::SI_INDIRECT_DST_V1:
5624 case AMDGPU::SI_INDIRECT_DST_V2:
5625 case AMDGPU::SI_INDIRECT_DST_V4:
5626 case AMDGPU::SI_INDIRECT_DST_V8:
5627 case AMDGPU::SI_INDIRECT_DST_V9:
5628 case AMDGPU::SI_INDIRECT_DST_V10:
5629 case AMDGPU::SI_INDIRECT_DST_V11:
5630 case AMDGPU::SI_INDIRECT_DST_V12:
5631 case AMDGPU::SI_INDIRECT_DST_V16:
5632 case AMDGPU::SI_INDIRECT_DST_V32:
5633 return emitIndirectDst(MI, *BB, *getSubtarget());
5634 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5635 case AMDGPU::SI_KILL_I1_PSEUDO:
5636 return splitKillBlock(MI, BB);
5637 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5638 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5639 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5640 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5641
5642 Register Dst = MI.getOperand(0).getReg();
5643 const MachineOperand &Src0 = MI.getOperand(1);
5644 const MachineOperand &Src1 = MI.getOperand(2);
5645 const DebugLoc &DL = MI.getDebugLoc();
5646 Register SrcCond = MI.getOperand(3).getReg();
5647
5648 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5649 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5650 const auto *CondRC = TRI->getWaveMaskRegClass();
5651 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5652
5653 const TargetRegisterClass *Src0RC = Src0.isReg()
5654 ? MRI.getRegClass(Src0.getReg())
5655 : &AMDGPU::VReg_64RegClass;
5656 const TargetRegisterClass *Src1RC = Src1.isReg()
5657 ? MRI.getRegClass(Src1.getReg())
5658 : &AMDGPU::VReg_64RegClass;
5659
5660 const TargetRegisterClass *Src0SubRC =
5661 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5662 const TargetRegisterClass *Src1SubRC =
5663 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5664
5665 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5666 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5667 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5668 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5669
5670 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5671 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5672 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5673 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5674
5675 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
5676 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5677 .addImm(0)
5678 .add(Src0Sub0)
5679 .addImm(0)
5680 .add(Src1Sub0)
5681 .addReg(SrcCondCopy);
5682 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5683 .addImm(0)
5684 .add(Src0Sub1)
5685 .addImm(0)
5686 .add(Src1Sub1)
5687 .addReg(SrcCondCopy);
5688
5689 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5690 .addReg(DstLo)
5691 .addImm(AMDGPU::sub0)
5692 .addReg(DstHi)
5693 .addImm(AMDGPU::sub1);
5694 MI.eraseFromParent();
5695 return BB;
5696 }
5697 case AMDGPU::SI_BR_UNDEF: {
5698 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5699 const DebugLoc &DL = MI.getDebugLoc();
5700 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5701 .add(MI.getOperand(0));
5702 Br->getOperand(1).setIsUndef(); // read undef SCC
5703 MI.eraseFromParent();
5704 return BB;
5705 }
5706 case AMDGPU::ADJCALLSTACKUP:
5707 case AMDGPU::ADJCALLSTACKDOWN: {
5708 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5709 MachineInstrBuilder MIB(*MF, &MI);
5710 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5711 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5712 return BB;
5713 }
5714 case AMDGPU::SI_CALL_ISEL: {
5715 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5716 const DebugLoc &DL = MI.getDebugLoc();
5717
5718 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5719
5720 MachineInstrBuilder MIB;
5721 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5722
5723 for (const MachineOperand &MO : MI.operands())
5724 MIB.add(MO);
5725
5726 MIB.cloneMemRefs(MI);
5727 MI.eraseFromParent();
5728 return BB;
5729 }
5730 case AMDGPU::V_ADD_CO_U32_e32:
5731 case AMDGPU::V_SUB_CO_U32_e32:
5732 case AMDGPU::V_SUBREV_CO_U32_e32: {
5733 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5734 const DebugLoc &DL = MI.getDebugLoc();
5735 unsigned Opc = MI.getOpcode();
5736
5737 bool NeedClampOperand = false;
5738 if (TII->pseudoToMCOpcode(Opc) == -1) {
5739 Opc = AMDGPU::getVOPe64(Opc);
5740 NeedClampOperand = true;
5741 }
5742
5743 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5744 if (TII->isVOP3(*I)) {
5745 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5746 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5747 I.addReg(TRI->getVCC(), RegState::Define);
5748 }
5749 I.add(MI.getOperand(1)).add(MI.getOperand(2));
5750 if (NeedClampOperand)
5751 I.addImm(0); // clamp bit for e64 encoding
5752
5753 TII->legalizeOperands(*I);
5754
5755 MI.eraseFromParent();
5756 return BB;
5757 }
5758 case AMDGPU::V_ADDC_U32_e32:
5759 case AMDGPU::V_SUBB_U32_e32:
5760 case AMDGPU::V_SUBBREV_U32_e32:
5761 // These instructions have an implicit use of vcc which counts towards the
5762 // constant bus limit.
5763 TII->legalizeOperands(MI);
5764 return BB;
5765 case AMDGPU::DS_GWS_INIT:
5766 case AMDGPU::DS_GWS_SEMA_BR:
5767 case AMDGPU::DS_GWS_BARRIER:
5768 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5769 [[fallthrough]];
5770 case AMDGPU::DS_GWS_SEMA_V:
5771 case AMDGPU::DS_GWS_SEMA_P:
5772 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5773 // A s_waitcnt 0 is required to be the instruction immediately following.
5774 if (getSubtarget()->hasGWSAutoReplay()) {
5775 bundleInstWithWaitcnt(MI);
5776 return BB;
5777 }
5778
5779 return emitGWSMemViolTestLoop(MI, BB);
5780 case AMDGPU::S_SETREG_B32: {
5781 // Try to optimize cases that only set the denormal mode or rounding mode.
5782 //
5783 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5784 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5785 // instead.
5786 //
5787 // FIXME: This could be predicates on the immediate, but tablegen doesn't
5788 // allow you to have a no side effect instruction in the output of a
5789 // sideeffecting pattern.
5790 auto [ID, Offset, Width] =
5791 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5792 if (ID != AMDGPU::Hwreg::ID_MODE)
5793 return BB;
5794
5795 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5796 const unsigned SetMask = WidthMask << Offset;
5797
5798 if (getSubtarget()->hasDenormModeInst()) {
5799 unsigned SetDenormOp = 0;
5800 unsigned SetRoundOp = 0;
5801
5802 // The dedicated instructions can only set the whole denorm or round mode
5803 // at once, not a subset of bits in either.
5804 if (SetMask ==
5805 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5806 // If this fully sets both the round and denorm mode, emit the two
5807 // dedicated instructions for these.
5808 SetRoundOp = AMDGPU::S_ROUND_MODE;
5809 SetDenormOp = AMDGPU::S_DENORM_MODE;
5810 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5811 SetRoundOp = AMDGPU::S_ROUND_MODE;
5812 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5813 SetDenormOp = AMDGPU::S_DENORM_MODE;
5814 }
5815
5816 if (SetRoundOp || SetDenormOp) {
5817 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5818 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5819 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5820 unsigned ImmVal = Def->getOperand(1).getImm();
5821 if (SetRoundOp) {
5822 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5823 .addImm(ImmVal & 0xf);
5824
5825 // If we also have the denorm mode, get just the denorm mode bits.
5826 ImmVal >>= 4;
5827 }
5828
5829 if (SetDenormOp) {
5830 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5831 .addImm(ImmVal & 0xf);
5832 }
5833
5834 MI.eraseFromParent();
5835 return BB;
5836 }
5837 }
5838 }
5839
5840 // If only FP bits are touched, used the no side effects pseudo.
5841 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5842 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5843 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5844
5845 return BB;
5846 }
5847 case AMDGPU::S_INVERSE_BALLOT_U32:
5848 case AMDGPU::S_INVERSE_BALLOT_U64:
5849 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5850 // necessary. After that they are equivalent to a COPY.
5851 MI.setDesc(TII->get(AMDGPU::COPY));
5852 return BB;
5853 case AMDGPU::ENDPGM_TRAP: {
5854 const DebugLoc &DL = MI.getDebugLoc();
5855 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5856 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5857 MI.addOperand(MachineOperand::CreateImm(0));
5858 return BB;
5859 }
5860
5861 // We need a block split to make the real endpgm a terminator. We also don't
5862 // want to break phis in successor blocks, so we can't just delete to the
5863 // end of the block.
5864
5865 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5866 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5867 MF->push_back(TrapBB);
5868 // clang-format off
5869 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5870 .addImm(0);
5871 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5872 .addMBB(TrapBB);
5873 // clang-format on
5874
5875 BB->addSuccessor(TrapBB);
5876 MI.eraseFromParent();
5877 return SplitBB;
5878 }
5879 case AMDGPU::SIMULATED_TRAP: {
5880 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5881 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5882 MachineBasicBlock *SplitBB =
5883 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5884 MI.eraseFromParent();
5885 return SplitBB;
5886 }
5887 default:
5888 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5889 if (!MI.mayStore())
5890 AddMemOpInit(MI);
5891 return BB;
5892 }
5893 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
5894 }
5895 }
5896
enableAggressiveFMAFusion(EVT VT) const5897 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
5898 // This currently forces unfolding various combinations of fsub into fma with
5899 // free fneg'd operands. As long as we have fast FMA (controlled by
5900 // isFMAFasterThanFMulAndFAdd), we should perform these.
5901
5902 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5903 // most of these combines appear to be cycle neutral but save on instruction
5904 // count / code size.
5905 return true;
5906 }
5907
enableAggressiveFMAFusion(LLT Ty) const5908 bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
5909
getSetCCResultType(const DataLayout & DL,LLVMContext & Ctx,EVT VT) const5910 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
5911 EVT VT) const {
5912 if (!VT.isVector()) {
5913 return MVT::i1;
5914 }
5915 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5916 }
5917
getScalarShiftAmountTy(const DataLayout &,EVT VT) const5918 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
5919 // TODO: Should i16 be used always if legal? For now it would force VALU
5920 // shifts.
5921 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5922 }
5923
getPreferredShiftAmountTy(LLT Ty) const5924 LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
5925 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5926 ? Ty.changeElementSize(16)
5927 : Ty.changeElementSize(32);
5928 }
5929
5930 // Answering this is somewhat tricky and depends on the specific device which
5931 // have different rates for fma or all f64 operations.
5932 //
5933 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5934 // regardless of which device (although the number of cycles differs between
5935 // devices), so it is always profitable for f64.
5936 //
5937 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5938 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
5939 // which we can always do even without fused FP ops since it returns the same
5940 // result as the separate operations and since it is always full
5941 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5942 // however does not support denormals, so we do report fma as faster if we have
5943 // a fast fma device and require denormals.
5944 //
isFMAFasterThanFMulAndFAdd(const MachineFunction & MF,EVT VT) const5945 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5946 EVT VT) const {
5947 VT = VT.getScalarType();
5948
5949 switch (VT.getSimpleVT().SimpleTy) {
5950 case MVT::f32: {
5951 // If mad is not available this depends only on if f32 fma is full rate.
5952 if (!Subtarget->hasMadMacF32Insts())
5953 return Subtarget->hasFastFMAF32();
5954
5955 // Otherwise f32 mad is always full rate and returns the same result as
5956 // the separate operations so should be preferred over fma.
5957 // However does not support denormals.
5958 if (!denormalModeIsFlushAllF32(MF))
5959 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5960
5961 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5962 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5963 }
5964 case MVT::f64:
5965 return true;
5966 case MVT::f16:
5967 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5968 default:
5969 break;
5970 }
5971
5972 return false;
5973 }
5974
isFMAFasterThanFMulAndFAdd(const MachineFunction & MF,LLT Ty) const5975 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5976 LLT Ty) const {
5977 switch (Ty.getScalarSizeInBits()) {
5978 case 16:
5979 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5980 case 32:
5981 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5982 case 64:
5983 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5984 default:
5985 break;
5986 }
5987
5988 return false;
5989 }
5990
isFMADLegal(const MachineInstr & MI,LLT Ty) const5991 bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
5992 if (!Ty.isScalar())
5993 return false;
5994
5995 if (Ty.getScalarSizeInBits() == 16)
5996 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5997 if (Ty.getScalarSizeInBits() == 32)
5998 return Subtarget->hasMadMacF32Insts() &&
5999 denormalModeIsFlushAllF32(*MI.getMF());
6000
6001 return false;
6002 }
6003
isFMADLegal(const SelectionDAG & DAG,const SDNode * N) const6004 bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
6005 const SDNode *N) const {
6006 // TODO: Check future ftz flag
6007 // v_mad_f32/v_mac_f32 do not support denormals.
6008 EVT VT = N->getValueType(0);
6009 if (VT == MVT::f32)
6010 return Subtarget->hasMadMacF32Insts() &&
6011 denormalModeIsFlushAllF32(DAG.getMachineFunction());
6012 if (VT == MVT::f16) {
6013 return Subtarget->hasMadF16() &&
6014 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
6015 }
6016
6017 return false;
6018 }
6019
6020 //===----------------------------------------------------------------------===//
6021 // Custom DAG Lowering Operations
6022 //===----------------------------------------------------------------------===//
6023
6024 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6025 // wider vector type is legal.
splitUnaryVectorOp(SDValue Op,SelectionDAG & DAG) const6026 SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
6027 SelectionDAG &DAG) const {
6028 unsigned Opc = Op.getOpcode();
6029 EVT VT = Op.getValueType();
6030 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6031 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6032 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6033 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6034
6035 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6036
6037 SDLoc SL(Op);
6038 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6039 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6040
6041 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6042 }
6043
6044 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6045 // wider vector type is legal.
splitBinaryVectorOp(SDValue Op,SelectionDAG & DAG) const6046 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
6047 SelectionDAG &DAG) const {
6048 unsigned Opc = Op.getOpcode();
6049 EVT VT = Op.getValueType();
6050 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6051 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6052 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6053 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6054 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6055 VT == MVT::v32bf16);
6056
6057 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6058 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6059
6060 SDLoc SL(Op);
6061
6062 SDValue OpLo =
6063 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6064 SDValue OpHi =
6065 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6066
6067 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6068 }
6069
splitTernaryVectorOp(SDValue Op,SelectionDAG & DAG) const6070 SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6071 SelectionDAG &DAG) const {
6072 unsigned Opc = Op.getOpcode();
6073 EVT VT = Op.getValueType();
6074 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6075 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6076 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6077 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6078 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6079 VT == MVT::v32bf16);
6080
6081 SDValue Op0 = Op.getOperand(0);
6082 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6083 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6084 : std::pair(Op0, Op0);
6085
6086 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6087 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6088
6089 SDLoc SL(Op);
6090 auto ResVT = DAG.GetSplitDestVTs(VT);
6091
6092 SDValue OpLo =
6093 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6094 SDValue OpHi =
6095 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6096
6097 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6098 }
6099
LowerOperation(SDValue Op,SelectionDAG & DAG) const6100 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6101 switch (Op.getOpcode()) {
6102 default:
6103 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6104 case ISD::BRCOND:
6105 return LowerBRCOND(Op, DAG);
6106 case ISD::RETURNADDR:
6107 return LowerRETURNADDR(Op, DAG);
6108 case ISD::LOAD: {
6109 SDValue Result = LowerLOAD(Op, DAG);
6110 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6111 "Load should return a value and a chain");
6112 return Result;
6113 }
6114 case ISD::FSQRT: {
6115 EVT VT = Op.getValueType();
6116 if (VT == MVT::f32)
6117 return lowerFSQRTF32(Op, DAG);
6118 if (VT == MVT::f64)
6119 return lowerFSQRTF64(Op, DAG);
6120 return SDValue();
6121 }
6122 case ISD::FSIN:
6123 case ISD::FCOS:
6124 return LowerTrig(Op, DAG);
6125 case ISD::SELECT:
6126 return LowerSELECT(Op, DAG);
6127 case ISD::FDIV:
6128 return LowerFDIV(Op, DAG);
6129 case ISD::FFREXP:
6130 return LowerFFREXP(Op, DAG);
6131 case ISD::ATOMIC_CMP_SWAP:
6132 return LowerATOMIC_CMP_SWAP(Op, DAG);
6133 case ISD::STORE:
6134 return LowerSTORE(Op, DAG);
6135 case ISD::GlobalAddress: {
6136 MachineFunction &MF = DAG.getMachineFunction();
6137 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6138 return LowerGlobalAddress(MFI, Op, DAG);
6139 }
6140 case ISD::INTRINSIC_WO_CHAIN:
6141 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6142 case ISD::INTRINSIC_W_CHAIN:
6143 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6144 case ISD::INTRINSIC_VOID:
6145 return LowerINTRINSIC_VOID(Op, DAG);
6146 case ISD::ADDRSPACECAST:
6147 return lowerADDRSPACECAST(Op, DAG);
6148 case ISD::INSERT_SUBVECTOR:
6149 return lowerINSERT_SUBVECTOR(Op, DAG);
6150 case ISD::INSERT_VECTOR_ELT:
6151 return lowerINSERT_VECTOR_ELT(Op, DAG);
6152 case ISD::EXTRACT_VECTOR_ELT:
6153 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6154 case ISD::VECTOR_SHUFFLE:
6155 return lowerVECTOR_SHUFFLE(Op, DAG);
6156 case ISD::SCALAR_TO_VECTOR:
6157 return lowerSCALAR_TO_VECTOR(Op, DAG);
6158 case ISD::BUILD_VECTOR:
6159 return lowerBUILD_VECTOR(Op, DAG);
6160 case ISD::FP_ROUND:
6161 case ISD::STRICT_FP_ROUND:
6162 return lowerFP_ROUND(Op, DAG);
6163 case ISD::TRAP:
6164 return lowerTRAP(Op, DAG);
6165 case ISD::DEBUGTRAP:
6166 return lowerDEBUGTRAP(Op, DAG);
6167 case ISD::ABS:
6168 case ISD::FABS:
6169 case ISD::FNEG:
6170 case ISD::FCANONICALIZE:
6171 case ISD::BSWAP:
6172 return splitUnaryVectorOp(Op, DAG);
6173 case ISD::FMINNUM:
6174 case ISD::FMAXNUM:
6175 return lowerFMINNUM_FMAXNUM(Op, DAG);
6176 case ISD::FMINIMUMNUM:
6177 case ISD::FMAXIMUMNUM:
6178 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6179 case ISD::FMINIMUM:
6180 case ISD::FMAXIMUM:
6181 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6182 case ISD::FLDEXP:
6183 case ISD::STRICT_FLDEXP:
6184 return lowerFLDEXP(Op, DAG);
6185 case ISD::FMA:
6186 return splitTernaryVectorOp(Op, DAG);
6187 case ISD::FP_TO_SINT:
6188 case ISD::FP_TO_UINT:
6189 return LowerFP_TO_INT(Op, DAG);
6190 case ISD::SHL:
6191 case ISD::SRA:
6192 case ISD::SRL:
6193 case ISD::ADD:
6194 case ISD::SUB:
6195 case ISD::SMIN:
6196 case ISD::SMAX:
6197 case ISD::UMIN:
6198 case ISD::UMAX:
6199 case ISD::FADD:
6200 case ISD::FMUL:
6201 case ISD::FMINNUM_IEEE:
6202 case ISD::FMAXNUM_IEEE:
6203 case ISD::UADDSAT:
6204 case ISD::USUBSAT:
6205 case ISD::SADDSAT:
6206 case ISD::SSUBSAT:
6207 return splitBinaryVectorOp(Op, DAG);
6208 case ISD::FCOPYSIGN:
6209 return lowerFCOPYSIGN(Op, DAG);
6210 case ISD::MUL:
6211 return lowerMUL(Op, DAG);
6212 case ISD::SMULO:
6213 case ISD::UMULO:
6214 return lowerXMULO(Op, DAG);
6215 case ISD::SMUL_LOHI:
6216 case ISD::UMUL_LOHI:
6217 return lowerXMUL_LOHI(Op, DAG);
6218 case ISD::DYNAMIC_STACKALLOC:
6219 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6220 case ISD::STACKSAVE:
6221 return LowerSTACKSAVE(Op, DAG);
6222 case ISD::GET_ROUNDING:
6223 return lowerGET_ROUNDING(Op, DAG);
6224 case ISD::SET_ROUNDING:
6225 return lowerSET_ROUNDING(Op, DAG);
6226 case ISD::PREFETCH:
6227 return lowerPREFETCH(Op, DAG);
6228 case ISD::FP_EXTEND:
6229 case ISD::STRICT_FP_EXTEND:
6230 return lowerFP_EXTEND(Op, DAG);
6231 case ISD::GET_FPENV:
6232 return lowerGET_FPENV(Op, DAG);
6233 case ISD::SET_FPENV:
6234 return lowerSET_FPENV(Op, DAG);
6235 }
6236 return SDValue();
6237 }
6238
6239 // Used for D16: Casts the result of an instruction into the right vector,
6240 // packs values if loads return unpacked values.
adjustLoadValueTypeImpl(SDValue Result,EVT LoadVT,const SDLoc & DL,SelectionDAG & DAG,bool Unpacked)6241 static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
6242 const SDLoc &DL, SelectionDAG &DAG,
6243 bool Unpacked) {
6244 if (!LoadVT.isVector())
6245 return Result;
6246
6247 // Cast back to the original packed type or to a larger type that is a
6248 // multiple of 32 bit for D16. Widening the return type is a required for
6249 // legalization.
6250 EVT FittingLoadVT = LoadVT;
6251 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6252 FittingLoadVT =
6253 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6254 LoadVT.getVectorNumElements() + 1);
6255 }
6256
6257 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6258 // Truncate to v2i16/v4i16.
6259 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6260
6261 // Workaround legalizer not scalarizing truncate after vector op
6262 // legalization but not creating intermediate vector trunc.
6263 SmallVector<SDValue, 4> Elts;
6264 DAG.ExtractVectorElements(Result, Elts);
6265 for (SDValue &Elt : Elts)
6266 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6267
6268 // Pad illegal v1i16/v3fi6 to v4i16
6269 if ((LoadVT.getVectorNumElements() % 2) == 1)
6270 Elts.push_back(DAG.getPOISON(MVT::i16));
6271
6272 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6273
6274 // Bitcast to original type (v2f16/v4f16).
6275 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6276 }
6277
6278 // Cast back to the original packed type.
6279 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6280 }
6281
adjustLoadValueType(unsigned Opcode,MemSDNode * M,SelectionDAG & DAG,ArrayRef<SDValue> Ops,bool IsIntrinsic) const6282 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6283 SelectionDAG &DAG,
6284 ArrayRef<SDValue> Ops,
6285 bool IsIntrinsic) const {
6286 SDLoc DL(M);
6287
6288 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6289 EVT LoadVT = M->getValueType(0);
6290
6291 EVT EquivLoadVT = LoadVT;
6292 if (LoadVT.isVector()) {
6293 if (Unpacked) {
6294 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6295 LoadVT.getVectorNumElements());
6296 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6297 // Widen v3f16 to legal type
6298 EquivLoadVT =
6299 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6300 LoadVT.getVectorNumElements() + 1);
6301 }
6302 }
6303
6304 // Change from v4f16/v2f16 to EquivLoadVT.
6305 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6306
6307 SDValue Load = DAG.getMemIntrinsicNode(
6308 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6309 M->getMemoryVT(), M->getMemOperand());
6310
6311 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6312
6313 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6314 }
6315
lowerIntrinsicLoad(MemSDNode * M,bool IsFormat,SelectionDAG & DAG,ArrayRef<SDValue> Ops) const6316 SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6317 SelectionDAG &DAG,
6318 ArrayRef<SDValue> Ops) const {
6319 SDLoc DL(M);
6320 EVT LoadVT = M->getValueType(0);
6321 EVT EltType = LoadVT.getScalarType();
6322 EVT IntVT = LoadVT.changeTypeToInteger();
6323
6324 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6325
6326 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6327 bool IsTFE = M->getNumValues() == 3;
6328
6329 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6330 : AMDGPUISD::BUFFER_LOAD_FORMAT)
6331 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6332 : AMDGPUISD::BUFFER_LOAD;
6333
6334 if (IsD16) {
6335 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6336 }
6337
6338 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6339 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6340 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6341 IsTFE);
6342
6343 if (isTypeLegal(LoadVT)) {
6344 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6345 M->getMemOperand(), DAG);
6346 }
6347
6348 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6349 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6350 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6351 M->getMemOperand(), DAG);
6352 return DAG.getMergeValues(
6353 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6354 DL);
6355 }
6356
lowerICMPIntrinsic(const SITargetLowering & TLI,SDNode * N,SelectionDAG & DAG)6357 static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6358 SelectionDAG &DAG) {
6359 EVT VT = N->getValueType(0);
6360 unsigned CondCode = N->getConstantOperandVal(3);
6361 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6362 return DAG.getPOISON(VT);
6363
6364 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6365
6366 SDValue LHS = N->getOperand(1);
6367 SDValue RHS = N->getOperand(2);
6368
6369 SDLoc DL(N);
6370
6371 EVT CmpVT = LHS.getValueType();
6372 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6373 unsigned PromoteOp =
6374 ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6375 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6376 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6377 }
6378
6379 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6380
6381 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6382 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6383
6384 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6385 DAG.getCondCode(CCOpcode));
6386 if (VT.bitsEq(CCVT))
6387 return SetCC;
6388 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6389 }
6390
lowerFCMPIntrinsic(const SITargetLowering & TLI,SDNode * N,SelectionDAG & DAG)6391 static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6392 SelectionDAG &DAG) {
6393 EVT VT = N->getValueType(0);
6394
6395 unsigned CondCode = N->getConstantOperandVal(3);
6396 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6397 return DAG.getPOISON(VT);
6398
6399 SDValue Src0 = N->getOperand(1);
6400 SDValue Src1 = N->getOperand(2);
6401 EVT CmpVT = Src0.getValueType();
6402 SDLoc SL(N);
6403
6404 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6405 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6406 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6407 }
6408
6409 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6410 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6411 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6412 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6413 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
6414 DAG.getCondCode(CCOpcode));
6415 if (VT.bitsEq(CCVT))
6416 return SetCC;
6417 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6418 }
6419
lowerBALLOTIntrinsic(const SITargetLowering & TLI,SDNode * N,SelectionDAG & DAG)6420 static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
6421 SelectionDAG &DAG) {
6422 EVT VT = N->getValueType(0);
6423 SDValue Src = N->getOperand(1);
6424 SDLoc SL(N);
6425
6426 if (Src.getOpcode() == ISD::SETCC) {
6427 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6428 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6429 Src.getOperand(1), Src.getOperand(2));
6430 }
6431 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6432 // (ballot 0) -> 0
6433 if (Arg->isZero())
6434 return DAG.getConstant(0, SL, VT);
6435
6436 // (ballot 1) -> EXEC/EXEC_LO
6437 if (Arg->isOne()) {
6438 Register Exec;
6439 if (VT.getScalarSizeInBits() == 32)
6440 Exec = AMDGPU::EXEC_LO;
6441 else if (VT.getScalarSizeInBits() == 64)
6442 Exec = AMDGPU::EXEC;
6443 else
6444 return SDValue();
6445
6446 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6447 }
6448 }
6449
6450 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6451 // ISD::SETNE)
6452 return DAG.getNode(
6453 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6454 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6455 }
6456
/// Custom-legalize a lane-op intrinsic node \p N (readlane, readfirstlane,
/// writelane, permlane16, permlanex16, permlane64, set_inactive,
/// set_inactive_chain_arg, mov_dpp8 or update_dpp) whose value type is not
/// the size the operation is split into.
///
/// The split size is 32 bits, or 64 bits for update_dpp when the value size
/// is a multiple of 64, the subtarget has DPALU DPP and the DPP control
/// operand is legal for it.
///
/// Strategy, in order:
///  - value size equals the split size: nothing to do, returns SDValue().
///  - value narrower than 32 bits: any-extend the data operands to i32,
///    perform the operation on i32 and truncate the result back.
///  - value size not a multiple of the split size: returns SDValue().
///  - vector values with i32/f32/i16/f16/bf16 elements: either unroll a
///    whole-vector operation per element, or split into split-size
///    subvectors and concatenate the results.
///  - everything else: bitcast to a vector of split-size integers, unroll
///    per element and bitcast back.
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
                           SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  unsigned ValSize = VT.getSizeInBits();
  unsigned IID = N->getConstantOperandVal(0);
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
  SDLoc SL(N);
  MVT IntVT = MVT::getIntegerVT(ValSize);
  const GCNSubtarget *ST = TLI.getSubtarget();
  // Width of the pieces the operation is performed on.
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
      ST->hasDPALU_DPP() &&
      AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3)))
    SplitSize = 64;

  // Build a new INTRINSIC_WO_CHAIN node for the same intrinsic with result
  // type ValT and the given (possibly rewritten) source operands. Operands
  // that are not passed in are taken unchanged from N.
  auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
                                          SDValue Src2, MVT ValT) -> SDValue {
    SmallVector<SDValue, 8> Operands;
    // The operand list is collected back to front (last operand first) so
    // that the fall-through chain can add the operands shared by several
    // intrinsics; it is reversed below.
    switch (IID) {
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16:
    case Intrinsic::amdgcn_update_dpp:
      Operands.push_back(N->getOperand(6));
      Operands.push_back(N->getOperand(5));
      Operands.push_back(N->getOperand(4));
      [[fallthrough]];
    case Intrinsic::amdgcn_writelane:
      Operands.push_back(Src2);
      [[fallthrough]];
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
    case Intrinsic::amdgcn_mov_dpp8:
      Operands.push_back(Src1);
      [[fallthrough]];
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      Operands.push_back(Src0);
      break;
    default:
      llvm_unreachable("unhandled lane op");
    }

    // The intrinsic ID is operand 0; after the reverse it ends up first.
    Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
    std::reverse(Operands.begin(), Operands.end());

    // Carry over the convergence-control token by attaching a fresh glue node
    // for it as the last operand.
    if (SDNode *GL = N->getGluedNode()) {
      assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
      GL = GL->getOperand(0).getNode();
      Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
                                     SDValue(GL, 0)));
    }

    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
  };

  // Src1 and Src2 stay empty for intrinsics that do not have them.
  SDValue Src0 = N->getOperand(1);
  SDValue Src1, Src2;
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_mov_dpp8 ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = N->getOperand(2);
    if (IID == Intrinsic::amdgcn_writelane ||
        IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
      Src2 = N->getOperand(3);
  }

  if (ValSize == SplitSize) {
    // Already legal
    return SDValue();
  }

  if (ValSize < 32) {
    // Widen to i32: floating-point values are first bitcast to the
    // same-sized integer, then any-extended. The result is narrowed back the
    // same way.
    bool IsFloat = VT.isFloatingPoint();
    Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
                                SL, MVT::i32);

    // Src1 is widened only for these intrinsics; for the others it is passed
    // through unchanged.
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
      Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
                                  SL, MVT::i32);
    }

    // Src2 is widened only for writelane.
    if (IID == Intrinsic::amdgcn_writelane) {
      Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
                                  SL, MVT::i32);
    }

    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
    SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
  }

  if (ValSize % SplitSize != 0)
    return SDValue();

  // Scalarize a vector-typed node: emit one node with the same opcode per
  // result element, extracting the matching element from every vector operand
  // and reusing scalar operands as they are, then rebuild the vector.
  auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
    EVT VT = N->getValueType(0);
    unsigned NE = VT.getVectorNumElements();
    EVT EltVT = VT.getVectorElementType();
    SmallVector<SDValue, 8> Scalars;
    unsigned NumOperands = N->getNumOperands();
    SmallVector<SDValue, 4> Operands(NumOperands);
    SDNode *GL = N->getGluedNode();

    // only handle convergencectrl_glue
    assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);

    for (unsigned i = 0; i != NE; ++i) {
      // When a glue operand is present it is the last operand and is handled
      // separately below.
      for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
           ++j) {
        SDValue Operand = N->getOperand(j);
        EVT OperandVT = Operand.getValueType();
        if (OperandVT.isVector()) {
          // A vector operand; extract a single element.
          EVT OperandEltVT = OperandVT.getVectorElementType();
          Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
                                    Operand, DAG.getVectorIdxConstant(i, SL));
        } else {
          // A scalar operand; just use it as is.
          Operands[j] = Operand;
        }
      }

      // Each scalar node gets its own glue node for the same token.
      if (GL)
        Operands[NumOperands - 1] =
            DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
                        SDValue(GL->getOperand(0).getNode(), 0));

      Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
    }

    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
    return DAG.getBuildVector(VecVT, SL, Scalars);
  };

  if (VT.isVector()) {
    switch (MVT::SimpleValueType EltTy =
                VT.getVectorElementType().getSimpleVT().SimpleTy) {
    case MVT::i32:
    case MVT::f32:
      // 32-bit elements already match a 32-bit split size: operate on the
      // whole vector and unroll it element by element.
      if (SplitSize == 32) {
        SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
        return unrollLaneOp(LaneOp.getNode());
      }
      [[fallthrough]];
    case MVT::i16:
    case MVT::f16:
    case MVT::bf16: {
      // Cut the sources into subvectors of SplitSize bits, emit one lane op
      // per subvector and concatenate the pieces.
      unsigned SubVecNumElt =
          SplitSize / VT.getVectorElementType().getSizeInBits();
      MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
      SmallVector<SDValue, 4> Pieces;
      SDValue Src0SubVec, Src1SubVec, Src2SubVec;
      for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
        Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
                                 DAG.getConstant(EltIdx, SL, MVT::i32));

        if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
            IsPermLane16)
          Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
                                   DAG.getConstant(EltIdx, SL, MVT::i32));

        if (IID == Intrinsic::amdgcn_writelane)
          Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
                                   DAG.getConstant(EltIdx, SL, MVT::i32));

        // Pass the split form of whichever operand was split above and the
        // original form of the other one.
        Pieces.push_back(
            IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
                ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
                : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
        EltIdx += SubVecNumElt;
      }
      return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
    }
    default:
      // Handle all other cases by bitcasting to i32 vectors
      break;
    }
  }

  // Generic path: reinterpret the value as a vector of SplitSize-bit integers,
  // unroll the lane op over its elements and bitcast the result back.
  MVT VecVT =
      MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
  Src0 = DAG.getBitcast(VecVT, Src0);

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    Src1 = DAG.getBitcast(VecVT, Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2 = DAG.getBitcast(VecVT, Src2);

  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
  SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
  return DAG.getBitcast(VT, UnrolledLaneOp);
}
6654
/// Provide replacement values for the results of node \p N.
///
/// Replacement values are appended to \p Results; leaving \p Results
/// untouched means no replacement was produced here. Opcodes that are not
/// listed are forwarded to AMDGPUTargetLowering::ReplaceNodeResults.
void SITargetLowering::ReplaceNodeResults(SDNode *N,
                                          SmallVectorImpl<SDValue> &Results,
                                          SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::INSERT_VECTOR_ELT: {
    if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
      Results.push_back(Res);
    return;
  }
  case ISD::EXTRACT_VECTOR_ELT: {
    if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
      Results.push_back(Res);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    // Operand 0 is the intrinsic ID. IDs without a case below leave the inner
    // switch and produce no replacement.
    unsigned IID = N->getConstantOperandVal(0);
    switch (IID) {
    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
      return;
    case Intrinsic::amdgcn_cvt_pkrtz: {
      // Emit the conversion with an i32 result and bitcast it to v2f16.
      SDValue Src0 = N->getOperand(1);
      SDValue Src1 = N->getOperand(2);
      SDLoc SL(N);
      SDValue Cvt =
          DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
      return;
    }
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {
      SDValue Src0 = N->getOperand(1);
      SDValue Src1 = N->getOperand(2);
      SDLoc SL(N);
      unsigned Opcode;

      // Map the intrinsic to the matching target node.
      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
        Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
        Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
        Opcode = AMDGPUISD::CVT_PK_I16_I32;
      else
        Opcode = AMDGPUISD::CVT_PK_U16_U32;

      // Use the result type directly when it is legal; otherwise produce an
      // i32 and bitcast it to v2i16.
      EVT VT = N->getValueType(0);
      if (isTypeLegal(VT))
        Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
      else {
        SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
        Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
      }
      return;
    }
    case Intrinsic::amdgcn_s_buffer_load: {
      // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
      // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
      // combiner tries to merge the s_buffer_load_u8 with a sext instruction
      // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
      // s_buffer_load_i8.
      if (!Subtarget->hasScalarSubwordLoads())
        return;
      SDValue Op = SDValue(N, 0);
      SDValue Rsrc = Op.getOperand(1);
      SDValue Offset = Op.getOperand(2);
      SDValue CachePolicy = Op.getOperand(3);
      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
      SDLoc DL(Op);
      MachineFunction &MF = DAG.getMachineFunction();
      const DataLayout &DataLayout = DAG.getDataLayout();
      Align Alignment =
          DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
      // Memory operand for an invariant, dereferenceable load of the
      // value's store size.
      MachineMemOperand *MMO = MF.getMachineMemOperand(
          MachinePointerInfo(),
          MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
              MachineMemOperand::MOInvariant,
          VT.getStoreSize(), Alignment);
      SDValue LoadVal;
      if (!Offset->isDivergent()) {
        // Non-divergent offset: load through SBUFFER_LOAD_UBYTE with an i32
        // result and truncate it to the result type.
        SDValue Ops[] = {Rsrc, // source register
                         Offset, CachePolicy};
        SDValue BufferLoad =
            DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
                                    DAG.getVTList(MVT::i32), Ops, VT, MMO);
        LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
      } else {
        // Divergent offset: build buffer-load operands instead. The three
        // empty slots starting at Ops[3] are handed to setBufferOffsets.
        SDValue Ops[] = {
            DAG.getEntryNode(),                    // Chain
            Rsrc,                                  // rsrc
            DAG.getConstant(0, DL, MVT::i32),      // vindex
            {},                                    // voffset
            {},                                    // soffset
            {},                                    // offset
            CachePolicy,                           // cachepolicy
            DAG.getTargetConstant(0, DL, MVT::i1), // idxen
        };
        setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
        LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
      }
      Results.push_back(LoadVal);
      return;
    }
    case Intrinsic::amdgcn_dead: {
      // Every result of the node is replaced with poison of its type.
      for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
        Results.push_back(DAG.getPOISON(N->getValueType(I)));
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
      if (Res.getOpcode() == ISD::MERGE_VALUES) {
        // FIXME: Hacky
        // Report the operands of the MERGE_VALUES node individually.
        for (unsigned I = 0; I < Res.getNumOperands(); I++) {
          Results.push_back(Res.getOperand(I));
        }
      } else {
        // Report result 0 followed by result 1 of the lowered node.
        Results.push_back(Res);
        Results.push_back(Res.getValue(1));
      }
      return;
    }

    break;
  }
  case ISD::SELECT: {
    // Perform the select on the equivalent memory type of the value (at least
    // i32 wide) and convert the result back to the original type.
    SDLoc SL(N);
    EVT VT = N->getValueType(0);
    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
    SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
    SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));

    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {
      LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
      RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
      SelectVT = MVT::i32;
    }

    SDValue NewSelect =
        DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);

    if (NewVT != SelectVT)
      NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
    return;
  }
  case ISD::FNEG: {
    if (N->getValueType(0) != MVT::v2f16)
      break;

    // Flip bit 15 of both 16-bit halves of the value viewed as an i32.
    SDLoc SL(N);
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));

    SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
                             DAG.getConstant(0x80008000, SL, MVT::i32));
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
    return;
  }
  case ISD::FABS: {
    if (N->getValueType(0) != MVT::v2f16)
      break;

    // Clear bit 15 of both 16-bit halves of the value viewed as an i32.
    SDLoc SL(N);
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));

    SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
                             DAG.getConstant(0x7fff7fff, SL, MVT::i32));
    Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
    return;
  }
  case ISD::FSQRT: {
    if (N->getValueType(0) != MVT::f16)
      break;
    Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
    break;
  }
  default:
    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    break;
  }
}
6841
6842 /// Helper function for LowerBRCOND
findUser(SDValue Value,unsigned Opcode)6843 static SDNode *findUser(SDValue Value, unsigned Opcode) {
6844
6845 for (SDUse &U : Value->uses()) {
6846 if (U.get() != Value)
6847 continue;
6848
6849 if (U.getUser()->getOpcode() == Opcode)
6850 return U.getUser();
6851 }
6852 return nullptr;
6853 }
6854
isCFIntrinsic(const SDNode * Intr) const6855 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6856 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6857 switch (Intr->getConstantOperandVal(1)) {
6858 case Intrinsic::amdgcn_if:
6859 return AMDGPUISD::IF;
6860 case Intrinsic::amdgcn_else:
6861 return AMDGPUISD::ELSE;
6862 case Intrinsic::amdgcn_loop:
6863 return AMDGPUISD::LOOP;
6864 case Intrinsic::amdgcn_end_cf:
6865 llvm_unreachable("should not occur");
6866 default:
6867 return 0;
6868 }
6869 }
6870
6871 // break, if_break, else_break are all only used as inputs to loop, not
6872 // directly as branch conditions.
6873 return 0;
6874 }
6875
shouldEmitFixup(const GlobalValue * GV) const6876 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
6877 const Triple &TT = getTargetMachine().getTargetTriple();
6878 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
6879 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
6880 AMDGPU::shouldEmitConstantsToTextSection(TT);
6881 }
6882
shouldEmitGOTReloc(const GlobalValue * GV) const6883 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
6884 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6885 return false;
6886
6887 // FIXME: Either avoid relying on address space here or change the default
6888 // address space for functions to avoid the explicit check.
6889 return (GV->getValueType()->isFunctionTy() ||
6890 !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
6891 !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
6892 }
6893
shouldEmitPCReloc(const GlobalValue * GV) const6894 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
6895 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6896 }
6897
shouldUseLDSConstAddress(const GlobalValue * GV) const6898 bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
6899 if (!GV->hasExternalLinkage())
6900 return true;
6901
6902 const auto OS = getTargetMachine().getTargetTriple().getOS();
6903 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6904 }
6905
/// This transforms the control flow intrinsics so that they take the branch
/// destination as their last operand. When needed, it also swaps the branch
/// target with the one of the unconditional BR that uses this BRCOND.
///
/// \returns \p BRCOND unchanged when its condition does not come from a
/// control-flow intrinsic; otherwise the chain that callers should use in
/// place of the BRCOND.
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
  SDLoc DL(BRCOND);

  // Operand 1 is the branch condition, operand 2 the branch destination.
  SDNode *Intr = BRCOND.getOperand(1).getNode();
  SDValue Target = BRCOND.getOperand(2);
  SDNode *BR = nullptr;
  SDNode *SetCC = nullptr;

  if (Intr->getOpcode() == ISD::SETCC) {
    // As long as we negate the condition everything is fine
    SetCC = Intr;
    Intr = SetCC->getOperand(0).getNode();

  } else {
    // Get the target from BR if we don't negate the condition
    BR = findUser(BRCOND, ISD::BR);
    assert(BR && "brcond missing unconditional branch user");
    Target = BR->getOperand(1);
  }

  unsigned CFNode = isCFIntrinsic(Intr);
  if (CFNode == 0) {
    // This is a uniform branch so we don't need to legalize.
    return BRCOND;
  }

  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
                   Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;

  // A SETCC in front of the intrinsic must have the form (setne x, 1).
  assert(!SetCC ||
         (SetCC->getConstantOperandVal(1) == 1 &&
          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
              ISD::SETNE));

  // operands of the new intrinsic call
  SmallVector<SDValue, 4> Ops;
  if (HaveChain)
    Ops.push_back(BRCOND.getOperand(0));

  // Copy the intrinsic's operands, skipping the intrinsic ID (and the old
  // chain when there is one), then append the branch destination.
  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
  Ops.push_back(Target);

  // The new node produces every result of the intrinsic except the first.
  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());

  // build the new intrinsic call
  SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();

  if (!HaveChain) {
    // Pair the result with the BRCOND's incoming chain so that the last value
    // of Result is a chain in both cases.
    SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};

    Result = DAG.getMergeValues(Ops, DL).getNode();
  }

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
    SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
    DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
  }

  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  // Result i of the old intrinsic corresponds to result i - 1 of the new node
  // because the first result was dropped.
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
                             SDValue(Result, i - 1), SDValue());

    // Bypass the old copy: its users now use its incoming chain.
    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
                                Intr->getOperand(0));

  return Chain;
}
6988
LowerRETURNADDR(SDValue Op,SelectionDAG & DAG) const6989 SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
6990 MVT VT = Op.getSimpleValueType();
6991 SDLoc DL(Op);
6992 // Checking the depth
6993 if (Op.getConstantOperandVal(0) != 0)
6994 return DAG.getConstant(0, DL, VT);
6995
6996 MachineFunction &MF = DAG.getMachineFunction();
6997 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6998 // Check for kernel and shader functions
6999 if (Info->isEntryFunction())
7000 return DAG.getConstant(0, DL, VT);
7001
7002 MachineFrameInfo &MFI = MF.getFrameInfo();
7003 // There is a call to @llvm.returnaddress in this function
7004 MFI.setReturnAddressIsTaken(true);
7005
7006 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7007 // Get the return address reg and mark it as an implicit live-in
7008 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7009 getRegClassFor(VT, Op.getNode()->isDivergent()));
7010
7011 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7012 }
7013
getFPExtOrFPRound(SelectionDAG & DAG,SDValue Op,const SDLoc & DL,EVT VT) const7014 SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7015 const SDLoc &DL, EVT VT) const {
7016 return Op.getValueType().bitsLE(VT)
7017 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7018 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7019 DAG.getTargetConstant(0, DL, MVT::i32));
7020 }
7021
splitFP_ROUNDVectorOp(SDValue Op,SelectionDAG & DAG) const7022 SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7023 SelectionDAG &DAG) const {
7024 EVT DstVT = Op.getValueType();
7025 unsigned NumElts = DstVT.getVectorNumElements();
7026 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7027
7028 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7029
7030 SDLoc DL(Op);
7031 unsigned Opc = Op.getOpcode();
7032 SDValue Flags = Op.getOperand(1);
7033 EVT HalfDstVT =
7034 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7035 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7036 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7037
7038 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7039 }
7040
lowerFP_ROUND(SDValue Op,SelectionDAG & DAG) const7041 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7042 SDValue Src = Op.getOperand(0);
7043 EVT SrcVT = Src.getValueType();
7044 EVT DstVT = Op.getValueType();
7045
7046 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7047 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7048 if (SrcVT.getScalarType() != MVT::f32)
7049 return SDValue();
7050 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7051 }
7052
7053 if (SrcVT.getScalarType() != MVT::f64)
7054 return Op;
7055
7056 SDLoc DL(Op);
7057 if (DstVT == MVT::f16) {
7058 // TODO: Handle strictfp
7059 if (Op.getOpcode() != ISD::FP_ROUND)
7060 return Op;
7061
7062 if (!Subtarget->has16BitInsts()) {
7063 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7064 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7065 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7066 }
7067 if (getTargetMachine().Options.UnsafeFPMath) {
7068 SDValue Flags = Op.getOperand(1);
7069 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7070 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7071 }
7072 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7073 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7074 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7075 }
7076
7077 assert(DstVT.getScalarType() == MVT::bf16 &&
7078 "custom lower FP_ROUND for f16 or bf16");
7079 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7080
7081 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7082 // hardware f32 -> bf16 instruction.
7083 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) :
7084 MVT::f32;
7085 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7086 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7087 DAG.getTargetConstant(0, DL, MVT::i32));
7088 }
7089
lowerFMINNUM_FMAXNUM(SDValue Op,SelectionDAG & DAG) const7090 SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7091 SelectionDAG &DAG) const {
7092 EVT VT = Op.getValueType();
7093 const MachineFunction &MF = DAG.getMachineFunction();
7094 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7095 bool IsIEEEMode = Info->getMode().IEEE;
7096
7097 // FIXME: Assert during selection that this is only selected for
7098 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7099 // mode functions, but this happens to be OK since it's only done in cases
7100 // where there is known no sNaN.
7101 if (IsIEEEMode)
7102 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7103
7104 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7105 VT == MVT::v16bf16)
7106 return splitBinaryVectorOp(Op, DAG);
7107 return Op;
7108 }
7109
7110 SDValue
lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,SelectionDAG & DAG) const7111 SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7112 SelectionDAG &DAG) const {
7113 EVT VT = Op.getValueType();
7114 const MachineFunction &MF = DAG.getMachineFunction();
7115 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7116 bool IsIEEEMode = Info->getMode().IEEE;
7117
7118 if (IsIEEEMode)
7119 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7120
7121 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7122 VT == MVT::v16bf16)
7123 return splitBinaryVectorOp(Op, DAG);
7124 return Op;
7125 }
7126
lowerFMINIMUM_FMAXIMUM(SDValue Op,SelectionDAG & DAG) const7127 SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7128 SelectionDAG &DAG) const {
7129 EVT VT = Op.getValueType();
7130 if (VT.isVector())
7131 return splitBinaryVectorOp(Op, DAG);
7132
7133 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7134 !Subtarget->hasMinimum3Maximum3F16() &&
7135 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7136 "should not need to widen f16 minimum/maximum to v2f16");
7137
7138 // Widen f16 operation to v2f16
7139
7140 // fminimum f16:x, f16:y ->
7141 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7142 // (v2f16 (scalar_to_vector y))), 0
7143 SDLoc SL(Op);
7144 SDValue WideSrc0 =
7145 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7146 SDValue WideSrc1 =
7147 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7148
7149 SDValue Widened =
7150 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7151
7152 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7153 DAG.getConstant(0, SL, MVT::i32));
7154 }
7155
lowerFLDEXP(SDValue Op,SelectionDAG & DAG) const7156 SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7157 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7158 EVT VT = Op.getValueType();
7159 assert(VT == MVT::f16);
7160
7161 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7162 EVT ExpVT = Exp.getValueType();
7163 if (ExpVT == MVT::i16)
7164 return Op;
7165
7166 SDLoc DL(Op);
7167
7168 // Correct the exponent type for f16 to i16.
7169 // Clamp the range of the exponent to the instruction's range.
7170
7171 // TODO: This should be a generic narrowing legalization, and can easily be
7172 // for GlobalISel.
7173
7174 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7175 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7176
7177 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7178 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7179
7180 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7181
7182 if (IsStrict) {
7183 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7184 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7185 }
7186
7187 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7188 }
7189
getExtOpcodeForPromotedOp(SDValue Op)7190 static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
7191 switch (Op->getOpcode()) {
7192 case ISD::SRA:
7193 case ISD::SMIN:
7194 case ISD::SMAX:
7195 return ISD::SIGN_EXTEND;
7196 case ISD::SRL:
7197 case ISD::UMIN:
7198 case ISD::UMAX:
7199 return ISD::ZERO_EXTEND;
7200 case ISD::ADD:
7201 case ISD::SUB:
7202 case ISD::AND:
7203 case ISD::OR:
7204 case ISD::XOR:
7205 case ISD::SHL:
7206 case ISD::SELECT:
7207 case ISD::MUL:
7208 // operation result won't be influenced by garbage high bits.
7209 // TODO: are all of those cases correct, and are there more?
7210 return ISD::ANY_EXTEND;
7211 case ISD::SETCC: {
7212 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7213 return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7214 }
7215 default:
7216 llvm_unreachable("unexpected opcode!");
7217 }
7218 }
7219
promoteUniformOpToI32(SDValue Op,DAGCombinerInfo & DCI) const7220 SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7221 DAGCombinerInfo &DCI) const {
7222 const unsigned Opc = Op.getOpcode();
7223 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7224 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7225 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7226 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7227 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7228
7229 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7230 : Op->getOperand(0).getValueType();
7231 auto ExtTy = OpTy.changeElementType(MVT::i32);
7232
7233 if (DCI.isBeforeLegalizeOps() ||
7234 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7235 return SDValue();
7236
7237 auto &DAG = DCI.DAG;
7238
7239 SDLoc DL(Op);
7240 SDValue LHS;
7241 SDValue RHS;
7242 if (Opc == ISD::SELECT) {
7243 LHS = Op->getOperand(1);
7244 RHS = Op->getOperand(2);
7245 } else {
7246 LHS = Op->getOperand(0);
7247 RHS = Op->getOperand(1);
7248 }
7249
7250 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7251 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7252
7253 // Special case: for shifts, the RHS always needs a zext.
7254 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7255 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7256 else
7257 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7258
7259 // setcc always return i1/i1 vec so no need to truncate after.
7260 if (Opc == ISD::SETCC) {
7261 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7262 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7263 }
7264
7265 // For other ops, we extend the operation's return type as well so we need to
7266 // truncate back to the original type.
7267 SDValue NewVal;
7268 if (Opc == ISD::SELECT)
7269 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7270 else
7271 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7272
7273 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
7274 }
7275
lowerFCOPYSIGN(SDValue Op,SelectionDAG & DAG) const7276 SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7277 SDValue Mag = Op.getOperand(0);
7278 EVT MagVT = Mag.getValueType();
7279
7280 if (MagVT.getVectorNumElements() > 2)
7281 return splitBinaryVectorOp(Op, DAG);
7282
7283 SDValue Sign = Op.getOperand(1);
7284 EVT SignVT = Sign.getValueType();
7285
7286 if (MagVT == SignVT)
7287 return Op;
7288
7289 // fcopysign v2f16:mag, v2f32:sign ->
7290 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7291
7292 SDLoc SL(Op);
7293 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7294 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
7295
7296 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7297
7298 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
7299 }
7300
7301 // Custom lowering for vector multiplications and s_mul_u64.
lowerMUL(SDValue Op,SelectionDAG & DAG) const7302 SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7303 EVT VT = Op.getValueType();
7304
7305 // Split vector operands.
7306 if (VT.isVector())
7307 return splitBinaryVectorOp(Op, DAG);
7308
7309 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
7310
7311 // There are four ways to lower s_mul_u64:
7312 //
7313 // 1. If all the operands are uniform, then we lower it as it is.
7314 //
7315 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
7316 // multiplications because there is not a vector equivalent of s_mul_u64.
7317 //
7318 // 3. If the cost model decides that it is more efficient to use vector
7319 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
7320 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
7321 //
7322 // 4. If the cost model decides to use vector registers and both of the
7323 // operands are zero-extended/sign-extended from 32-bits, then we split the
7324 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
7325 // possible to check if the operands are zero-extended or sign-extended in
7326 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
7327 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
7328 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7329 // If the cost model decides that we have to use vector registers, then
7330 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
7331 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
7332 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7333 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
7334 // SIInstrInfo.cpp .
7335
7336 if (Op->isDivergent())
7337 return SDValue();
7338
7339 SDValue Op0 = Op.getOperand(0);
7340 SDValue Op1 = Op.getOperand(1);
7341 // If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64
7342 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
7343 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7344 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7345 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7346 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7347 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7348 SDLoc SL(Op);
7349 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7350 return SDValue(
7351 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7352 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7353 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7354 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7355 return SDValue(
7356 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7357 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7358 return Op;
7359 }
7360
lowerXMULO(SDValue Op,SelectionDAG & DAG) const7361 SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7362 EVT VT = Op.getValueType();
7363 SDLoc SL(Op);
7364 SDValue LHS = Op.getOperand(0);
7365 SDValue RHS = Op.getOperand(1);
7366 bool isSigned = Op.getOpcode() == ISD::SMULO;
7367
7368 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
7369 const APInt &C = RHSC->getAPIntValue();
7370 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
7371 if (C.isPowerOf2()) {
7372 // smulo(x, signed_min) is same as umulo(x, signed_min).
7373 bool UseArithShift = isSigned && !C.isMinSignedValue();
7374 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
7375 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
7376 SDValue Overflow =
7377 DAG.getSetCC(SL, MVT::i1,
7378 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
7379 Result, ShiftAmt),
7380 LHS, ISD::SETNE);
7381 return DAG.getMergeValues({Result, Overflow}, SL);
7382 }
7383 }
7384
7385 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
7386 SDValue Top =
7387 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
7388
7389 SDValue Sign = isSigned
7390 ? DAG.getNode(ISD::SRA, SL, VT, Result,
7391 DAG.getConstant(VT.getScalarSizeInBits() - 1,
7392 SL, MVT::i32))
7393 : DAG.getConstant(0, SL, VT);
7394 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
7395
7396 return DAG.getMergeValues({Result, Overflow}, SL);
7397 }
7398
lowerXMUL_LOHI(SDValue Op,SelectionDAG & DAG) const7399 SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
7400 if (Op->isDivergent()) {
7401 // Select to V_MAD_[IU]64_[IU]32.
7402 return Op;
7403 }
7404 if (Subtarget->hasSMulHi()) {
7405 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
7406 return SDValue();
7407 }
7408 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
7409 // calculate the high part, so we might as well do the whole thing with
7410 // V_MAD_[IU]64_[IU]32.
7411 return Op;
7412 }
7413
lowerTRAP(SDValue Op,SelectionDAG & DAG) const7414 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
7415 if (!Subtarget->isTrapHandlerEnabled() ||
7416 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7417 return lowerTrapEndpgm(Op, DAG);
7418
7419 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
7420 : lowerTrapHsaQueuePtr(Op, DAG);
7421 }
7422
lowerTrapEndpgm(SDValue Op,SelectionDAG & DAG) const7423 SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
7424 SDLoc SL(Op);
7425 SDValue Chain = Op.getOperand(0);
7426 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
7427 }
7428
7429 SDValue
loadImplicitKernelArgument(SelectionDAG & DAG,MVT VT,const SDLoc & DL,Align Alignment,ImplicitParameter Param) const7430 SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
7431 const SDLoc &DL, Align Alignment,
7432 ImplicitParameter Param) const {
7433 MachineFunction &MF = DAG.getMachineFunction();
7434 uint64_t Offset = getImplicitParameterOffset(MF, Param);
7435 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
7436 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7437 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
7438 MachineMemOperand::MODereferenceable |
7439 MachineMemOperand::MOInvariant);
7440 }
7441
lowerTrapHsaQueuePtr(SDValue Op,SelectionDAG & DAG) const7442 SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
7443 SelectionDAG &DAG) const {
7444 SDLoc SL(Op);
7445 SDValue Chain = Op.getOperand(0);
7446
7447 SDValue QueuePtr;
7448 // For code object version 5, QueuePtr is passed through implicit kernarg.
7449 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7450 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7451 QueuePtr =
7452 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
7453 } else {
7454 MachineFunction &MF = DAG.getMachineFunction();
7455 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7456 Register UserSGPR = Info->getQueuePtrUserSGPR();
7457
7458 if (UserSGPR == AMDGPU::NoRegister) {
7459 // We probably are in a function incorrectly marked with
7460 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
7461 // trap, so just use a null pointer.
7462 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
7463 } else {
7464 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
7465 MVT::i64);
7466 }
7467 }
7468
7469 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
7470 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
7471
7472 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7473 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
7474 ToReg.getValue(1)};
7475 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7476 }
7477
lowerTrapHsa(SDValue Op,SelectionDAG & DAG) const7478 SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
7479 SDLoc SL(Op);
7480 SDValue Chain = Op.getOperand(0);
7481
7482 // We need to simulate the 's_trap 2' instruction on targets that run in
7483 // PRIV=1 (where it is treated as a nop).
7484 if (Subtarget->hasPrivEnabledTrap2NopBug())
7485 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
7486
7487 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7488 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7489 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7490 }
7491
lowerDEBUGTRAP(SDValue Op,SelectionDAG & DAG) const7492 SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
7493 SDLoc SL(Op);
7494 SDValue Chain = Op.getOperand(0);
7495 MachineFunction &MF = DAG.getMachineFunction();
7496
7497 if (!Subtarget->isTrapHandlerEnabled() ||
7498 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7499 LLVMContext &Ctx = MF.getFunction().getContext();
7500 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
7501 "debugtrap handler not supported",
7502 Op.getDebugLoc(), DS_Warning));
7503 return Chain;
7504 }
7505
7506 uint64_t TrapID =
7507 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
7508 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7509 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7510 }
7511
getSegmentAperture(unsigned AS,const SDLoc & DL,SelectionDAG & DAG) const7512 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7513 SelectionDAG &DAG) const {
7514 if (Subtarget->hasApertureRegs()) {
7515 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
7516 ? AMDGPU::SRC_SHARED_BASE
7517 : AMDGPU::SRC_PRIVATE_BASE;
7518 // Note: this feature (register) is broken. When used as a 32-bit operand,
7519 // it returns a wrong value (all zeroes?). The real value is in the upper 32
7520 // bits.
7521 //
7522 // To work around the issue, directly emit a 64 bit mov from this register
7523 // then extract the high bits. Note that this shouldn't even result in a
7524 // shift being emitted and simply become a pair of registers (e.g.):
7525 // s_mov_b64 s[6:7], src_shared_base
7526 // v_mov_b32_e32 v1, s7
7527 //
7528 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
7529 // coalescing would kick in and it would think it's okay to use the "HI"
7530 // subregister directly (instead of extracting the HI 32 bits) which is an
7531 // artificial (unusable) register.
7532 // Register TableGen definitions would need an overhaul to get rid of the
7533 // artificial "HI" aperture registers and prevent this kind of issue from
7534 // happening.
7535 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
7536 DAG.getRegister(ApertureRegNo, MVT::i64));
7537 return DAG.getNode(
7538 ISD::TRUNCATE, DL, MVT::i32,
7539 DAG.getNode(ISD::SRL, DL, MVT::i64,
7540 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7541 }
7542
7543 // For code object version 5, private_base and shared_base are passed through
7544 // implicit kernargs.
7545 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7546 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7547 ImplicitParameter Param =
7548 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
7549 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7550 }
7551
7552 MachineFunction &MF = DAG.getMachineFunction();
7553 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7554 Register UserSGPR = Info->getQueuePtrUserSGPR();
7555 if (UserSGPR == AMDGPU::NoRegister) {
7556 // We probably are in a function incorrectly marked with
7557 // amdgpu-no-queue-ptr. This is undefined.
7558 return DAG.getPOISON(MVT::i32);
7559 }
7560
7561 SDValue QueuePtr =
7562 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7563
7564 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7565 // private_segment_aperture_base_hi.
7566 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7567
7568 SDValue Ptr =
7569 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7570
7571 // TODO: Use custom target PseudoSourceValue.
7572 // TODO: We should use the value from the IR intrinsic call, but it might not
7573 // be available and how do we get it?
7574 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7575 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7576 commonAlignment(Align(64), StructOffset),
7577 MachineMemOperand::MODereferenceable |
7578 MachineMemOperand::MOInvariant);
7579 }
7580
7581 /// Return true if the value is a known valid address, such that a null check is
7582 /// not necessary.
isKnownNonNull(SDValue Val,SelectionDAG & DAG,const AMDGPUTargetMachine & TM,unsigned AddrSpace)7583 static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
7584 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7585 if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
7586 return true;
7587
7588 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7589 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7590
7591 // TODO: Search through arithmetic, handle arguments and loads
7592 // marked nonnull.
7593 return false;
7594 }
7595
lowerADDRSPACECAST(SDValue Op,SelectionDAG & DAG) const7596 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7597 SelectionDAG &DAG) const {
7598 SDLoc SL(Op);
7599
7600 const AMDGPUTargetMachine &TM =
7601 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7602
7603 unsigned DestAS, SrcAS;
7604 SDValue Src;
7605 bool IsNonNull = false;
7606 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7607 SrcAS = ASC->getSrcAddressSpace();
7608 Src = ASC->getOperand(0);
7609 DestAS = ASC->getDestAddressSpace();
7610 } else {
7611 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7612 Op.getConstantOperandVal(0) ==
7613 Intrinsic::amdgcn_addrspacecast_nonnull);
7614 Src = Op->getOperand(1);
7615 SrcAS = Op->getConstantOperandVal(2);
7616 DestAS = Op->getConstantOperandVal(3);
7617 IsNonNull = true;
7618 }
7619
7620 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7621
7622 // flat -> local/private
7623 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7624 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7625 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7626 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7627
7628 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7629 return Ptr;
7630
7631 unsigned NullVal = TM.getNullPointerValue(DestAS);
7632 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7633 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7634
7635 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7636 SegmentNullPtr);
7637 }
7638 }
7639
7640 // local/private -> flat
7641 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7642 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7643 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7644
7645 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7646 SDValue CvtPtr =
7647 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7648 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7649
7650 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7651 return CvtPtr;
7652
7653 unsigned NullVal = TM.getNullPointerValue(SrcAS);
7654 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7655
7656 SDValue NonNull =
7657 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7658
7659 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7660 FlatNullPtr);
7661 }
7662 }
7663
7664 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7665 Op.getValueType() == MVT::i64) {
7666 const SIMachineFunctionInfo *Info =
7667 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
7668 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7669 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7670 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7671 }
7672
7673 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7674 Src.getValueType() == MVT::i64)
7675 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7676
7677 // global <-> flat are no-ops and never emitted.
7678
7679 // Invalid casts are poison.
7680 return DAG.getPOISON(Op->getValueType(0));
7681 }
7682
7683 // This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7684 // the small vector and inserting them into the big vector. That is better than
7685 // the default expansion of doing it via a stack slot. Even though the use of
7686 // the stack slot would be optimized away afterwards, the stack slot itself
7687 // remains.
lowerINSERT_SUBVECTOR(SDValue Op,SelectionDAG & DAG) const7688 SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7689 SelectionDAG &DAG) const {
7690 SDValue Vec = Op.getOperand(0);
7691 SDValue Ins = Op.getOperand(1);
7692 SDValue Idx = Op.getOperand(2);
7693 EVT VecVT = Vec.getValueType();
7694 EVT InsVT = Ins.getValueType();
7695 EVT EltVT = VecVT.getVectorElementType();
7696 unsigned InsNumElts = InsVT.getVectorNumElements();
7697 unsigned IdxVal = Idx->getAsZExtVal();
7698 SDLoc SL(Op);
7699
7700 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7701 // Insert 32-bit registers at a time.
7702 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7703
7704 unsigned VecNumElts = VecVT.getVectorNumElements();
7705 EVT NewVecVT =
7706 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7707 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7708 : EVT::getVectorVT(*DAG.getContext(),
7709 MVT::i32, InsNumElts / 2);
7710
7711 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7712 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7713
7714 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7715 SDValue Elt;
7716 if (InsNumElts == 2) {
7717 Elt = Ins;
7718 } else {
7719 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7720 DAG.getConstant(I, SL, MVT::i32));
7721 }
7722 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7723 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7724 }
7725
7726 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7727 }
7728
7729 for (unsigned I = 0; I != InsNumElts; ++I) {
7730 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7731 DAG.getConstant(I, SL, MVT::i32));
7732 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7733 DAG.getConstant(IdxVal + I, SL, MVT::i32));
7734 }
7735 return Vec;
7736 }
7737
lowerINSERT_VECTOR_ELT(SDValue Op,SelectionDAG & DAG) const7738 SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7739 SelectionDAG &DAG) const {
7740 SDValue Vec = Op.getOperand(0);
7741 SDValue InsVal = Op.getOperand(1);
7742 SDValue Idx = Op.getOperand(2);
7743 EVT VecVT = Vec.getValueType();
7744 EVT EltVT = VecVT.getVectorElementType();
7745 unsigned VecSize = VecVT.getSizeInBits();
7746 unsigned EltSize = EltVT.getSizeInBits();
7747 SDLoc SL(Op);
7748
7749 // Specially handle the case of v4i16 with static indexing.
7750 unsigned NumElts = VecVT.getVectorNumElements();
7751 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
7752 if (NumElts == 4 && EltSize == 16 && KIdx) {
7753 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7754
7755 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7756 DAG.getConstant(0, SL, MVT::i32));
7757 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7758 DAG.getConstant(1, SL, MVT::i32));
7759
7760 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7761 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7762
7763 unsigned Idx = KIdx->getZExtValue();
7764 bool InsertLo = Idx < 2;
7765 SDValue InsHalf = DAG.getNode(
7766 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
7767 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7768 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7769
7770 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7771
7772 SDValue Concat =
7773 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
7774 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
7775
7776 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7777 }
7778
7779 // Static indexing does not lower to stack access, and hence there is no need
7780 // for special custom lowering to avoid stack access.
7781 if (isa<ConstantSDNode>(Idx))
7782 return SDValue();
7783
7784 // Avoid stack access for dynamic indexing by custom lowering to
7785 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
7786
7787 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7788
7789 MVT IntVT = MVT::getIntegerVT(VecSize);
7790
7791 // Convert vector index to bit-index and get the required bit mask.
7792 assert(isPowerOf2_32(EltSize));
7793 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7794 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7795 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7796 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7797 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7798
7799 // 1. Create a congruent vector with the target value in each element.
7800 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7801 DAG.getSplatBuildVector(VecVT, SL, InsVal));
7802
7803 // 2. Mask off all other indices except the required index within (1).
7804 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7805
7806 // 3. Mask off the required index within the target vector.
7807 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7808 SDValue RHS =
7809 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
7810
7811 // 4. Get (2) and (3) ORed into the target vector.
7812 SDValue BFI =
7813 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
7814
7815 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7816 }
7817
/// Custom lowering for EXTRACT_VECTOR_ELT.
///
/// First gives performExtractVectorEltCombine a chance to simplify the node.
/// Vectors of 128, 256 or 512 bits are then split into two halves, the half
/// containing the element is selected from the index, and a new
/// EXTRACT_VECTOR_ELT on that half is returned. All remaining vectors must be
/// at most 64 bits wide and are lowered to an integer shift-right by the
/// element's bit offset followed by a truncate/extend to the result type.
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc SL(Op);

  EVT ResultVT = Op.getValueType();
  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  EVT VecVT = Vec.getValueType();
  unsigned VecSize = VecVT.getSizeInBits();
  EVT EltVT = VecVT.getVectorElementType();

  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);

  // Make sure we do any optimizations that will make it easier to fold
  // source modifiers before obscuring it with bit operations.

  // XXX - Why doesn't this get called when vector_shuffle is expanded?
  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
    return Combined;

  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
    // Build the low and high halves of the vector by viewing it as a vector
    // of i64 and regrouping those 64-bit pieces.
    SDValue Lo, Hi;
    auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);

    if (VecSize == 128) {
      // Each half is exactly one i64 element.
      SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
      Lo = DAG.getBitcast(LoVT,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                                      DAG.getConstant(0, SL, MVT::i32)));
      Hi = DAG.getBitcast(HiVT,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                                      DAG.getConstant(1, SL, MVT::i32)));
    } else if (VecSize == 256) {
      // Each half is two i64 elements.
      SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
      SDValue Parts[4];
      for (unsigned P = 0; P < 4; ++P) {
        Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                               DAG.getConstant(P, SL, MVT::i32));
      }

      Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
                                            Parts[0], Parts[1]));
      Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
                                            Parts[2], Parts[3]));
    } else {
      assert(VecSize == 512);

      // Each half is four i64 elements.
      SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
      SDValue Parts[8];
      for (unsigned P = 0; P < 8; ++P) {
        Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
                               DAG.getConstant(P, SL, MVT::i32));
      }

      Lo = DAG.getBitcast(LoVT,
                          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
                                      Parts[0], Parts[1], Parts[2], Parts[3]));
      Hi = DAG.getBitcast(HiVT,
                          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
                                      Parts[4], Parts[5], Parts[6], Parts[7]));
    }

    // IdxMask is the last valid index within one half. An index that is
    // unsigned-greater than it selects the high half; masking the index with
    // it gives the position inside the selected half.
    EVT IdxVT = Idx.getValueType();
    unsigned NElem = VecVT.getVectorNumElements();
    assert(isPowerOf2_32(NElem));
    SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
    SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
    SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
  }

  assert(VecSize <= 64);

  MVT IntVT = MVT::getIntegerVT(VecSize);

  // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
  SDValue VecBC = peekThroughBitcasts(Vec);
  if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
    SDValue Src = VecBC.getOperand(0);
    Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
    Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
  }

  unsigned EltSize = EltVT.getSizeInBits();
  assert(isPowerOf2_32(EltSize));

  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);

  // Convert vector index to bit-index (* EltSize)
  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);

  // Shift the requested element down to bit 0 of the integer view.
  SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
  SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);

  // 16-bit float results are truncated to i16 first and then bitcast.
  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
    SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
    return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
  }

  return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
}
7919
elementPairIsContiguous(ArrayRef<int> Mask,int Elt)7920 static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7921 assert(Elt % 2 == 0);
7922 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7923 }
7924
elementPairIsOddToEven(ArrayRef<int> Mask,int Elt)7925 static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
7926 assert(Elt % 2 == 0);
7927 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
7928 !(Mask[Elt + 1] & 1);
7929 }
7930
/// Custom lowering for VECTOR_SHUFFLE.
///
/// The result is assembled two elements at a time. Each pair of mask entries
/// becomes one two-element piece, produced by an EXTRACT_SUBVECTOR, a
/// two-element VECTOR_SHUFFLE, or a BUILD_VECTOR of two extracted scalars. The
/// pieces are joined with CONCAT_VECTORS.
SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT ResultVT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
  MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
  const int NewSrcNumElts = 2;
  MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();

  // Break up the shuffle into registers sized pieces.
  //
  // We're trying to form sub-shuffles that the register allocation pipeline
  // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
  // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
  // pair of copies into a consecutive register copy, so use the ordinary
  // extract_vector_elt lowering unless we can use the shuffle.
  //
  // TODO: This is a bit of hack, and we should probably always use
  // extract_subvector for the largest possible subvector we can (or at least
  // use it for PackVT aligned pieces). However we have worse support for
  // combines on them don't directly treat extract_subvector / insert_subvector
  // as legal. The DAG scheduler also ends up doing a worse job with the
  // extract_subvectors.
  const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;

  // vector_shuffle <0,1,6,7> lhs, rhs
  // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
  //
  // vector_shuffle <6,7,2,3> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
  //
  // vector_shuffle <6,7,0,1> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)

  // Avoid scalarizing when both halves are reading from consecutive elements.

  // If we're treating 2 element shuffles as legal, also create odd-to-even
  // shuffles of neighboring pairs.
  //
  // vector_shuffle <3,2,7,6> lhs, rhs
  //   -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
  //                     vector_shuffle <1, 0> (extract_subvector rhs, 2)

  SmallVector<SDValue, 16> Pieces;
  for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
    if (ShouldUseConsecutiveExtract &&
        elementPairIsContiguous(SVN->getMask(), I)) {
      // The pair reads an aligned, consecutive pair from a single source:
      // take it as one subvector. Mask indices >= SrcNumElts refer to the
      // second shuffle operand.
      const int Idx = SVN->getMaskElt(I);
      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
                                   SVN->getOperand(VecIdx),
                                   DAG.getConstant(EltIdx, SL, MVT::i32));
      Pieces.push_back(SubVec);
    } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
               isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
      // Odd element followed by an even element: express the pair as a
      // two-element shuffle of the aligned pair(s) containing them.
      int Idx0 = SVN->getMaskElt(I);
      int Idx1 = SVN->getMaskElt(I + 1);

      // Resolve each mask index to (source operand, index within operand).
      SDValue SrcOp0 = SVN->getOperand(0);
      SDValue SrcOp1 = SrcOp0;
      if (Idx0 >= SrcNumElts) {
        SrcOp0 = SVN->getOperand(1);
        Idx0 -= SrcNumElts;
      }

      if (Idx1 >= SrcNumElts) {
        SrcOp1 = SVN->getOperand(1);
        Idx1 -= SrcNumElts;
      }

      int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
      int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);

      // Extract nearest even aligned piece.
      SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
                                    DAG.getConstant(AlignedIdx0, SL, MVT::i32));
      SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
                                    DAG.getConstant(AlignedIdx1, SL, MVT::i32));

      // Position of each element inside its aligned pair.
      int NewMaskIdx0 = Idx0 - AlignedIdx0;
      int NewMaskIdx1 = Idx1 - AlignedIdx1;

      SDValue Result0 = SubVec0;
      SDValue Result1 = SubVec0;

      if (SubVec0 != SubVec1) {
        // Two distinct pairs: the second element comes from the shuffle's
        // second operand, so offset its mask index accordingly.
        NewMaskIdx1 += NewSrcNumElts;
        Result1 = SubVec1;
      } else {
        // Both elements come from the same pair: the second shuffle operand
        // is unused.
        Result1 = DAG.getPOISON(PackVT);
      }

      SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
                                          {NewMaskIdx0, NewMaskIdx1});
      Pieces.push_back(Shuf);
    } else {
      // Fallback: extract both elements individually and pair them up.
      const int Idx0 = SVN->getMaskElt(I);
      const int Idx1 = SVN->getMaskElt(I + 1);
      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;

      SDValue Vec0 = SVN->getOperand(VecIdx0);
      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
                                 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));

      SDValue Vec1 = SVN->getOperand(VecIdx1);
      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
                                 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
      Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
    }
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
}
8049
lowerSCALAR_TO_VECTOR(SDValue Op,SelectionDAG & DAG) const8050 SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8051 SelectionDAG &DAG) const {
8052 SDValue SVal = Op.getOperand(0);
8053 EVT ResultVT = Op.getValueType();
8054 EVT SValVT = SVal.getValueType();
8055 SDValue UndefVal = DAG.getPOISON(SValVT);
8056 SDLoc SL(Op);
8057
8058 SmallVector<SDValue, 8> VElts;
8059 VElts.push_back(SVal);
8060 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8061 VElts.push_back(UndefVal);
8062
8063 return DAG.getBuildVector(ResultVT, SL, VElts);
8064 }
8065
lowerBUILD_VECTOR(SDValue Op,SelectionDAG & DAG) const8066 SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8067 SelectionDAG &DAG) const {
8068 SDLoc SL(Op);
8069 EVT VT = Op.getValueType();
8070
8071 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8072 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8073
8074 SDValue Lo = Op.getOperand(0);
8075 SDValue Hi = Op.getOperand(1);
8076
8077 // Avoid adding defined bits with the zero_extend.
8078 if (Hi.isUndef()) {
8079 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8080 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8081 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8082 }
8083
8084 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8085 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8086
8087 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8088 DAG.getConstant(16, SL, MVT::i32));
8089 if (Lo.isUndef())
8090 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8091
8092 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8093 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8094
8095 SDValue Or =
8096 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8097 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8098 }
8099
8100 // Split into 2-element chunks.
8101 const unsigned NumParts = VT.getVectorNumElements() / 2;
8102 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8103 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8104
8105 SmallVector<SDValue> Casts;
8106 for (unsigned P = 0; P < NumParts; ++P) {
8107 SDValue Vec = DAG.getBuildVector(
8108 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8109 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8110 }
8111
8112 SDValue Blend =
8113 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8114 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8115 }
8116
isOffsetFoldingLegal(const GlobalAddressSDNode * GA) const8117 bool SITargetLowering::isOffsetFoldingLegal(
8118 const GlobalAddressSDNode *GA) const {
8119 // OSes that use ELF REL relocations (instead of RELA) can only store a
8120 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8121 // which can create arbitrary 64-bit addends. (This is only a problem for
8122 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8123 // the high 32 bits of the addend.)
8124 //
8125 // This should be kept in sync with how HasRelocationAddend is initialized in
8126 // the constructor of ELFAMDGPUAsmBackend.
8127 if (!Subtarget->isAmdHsaOS())
8128 return false;
8129
8130 // We can fold offsets for anything that doesn't require a GOT relocation.
8131 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8132 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8133 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8134 !shouldEmitGOTReloc(GA->getGlobal());
8135 }
8136
8137 static SDValue
buildPCRelGlobalAddress(SelectionDAG & DAG,const GlobalValue * GV,const SDLoc & DL,int64_t Offset,EVT PtrVT,unsigned GAFlags=SIInstrInfo::MO_NONE)8138 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8139 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8140 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8141 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8142 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8143 // lowered to the following code sequence:
8144 //
8145 // For constant address space:
8146 // s_getpc_b64 s[0:1]
8147 // s_add_u32 s0, s0, $symbol
8148 // s_addc_u32 s1, s1, 0
8149 //
8150 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8151 // a fixup or relocation is emitted to replace $symbol with a literal
8152 // constant, which is a pc-relative offset from the encoding of the $symbol
8153 // operand to the global variable.
8154 //
8155 // For global address space:
8156 // s_getpc_b64 s[0:1]
8157 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8158 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8159 //
8160 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8161 // fixups or relocations are emitted to replace $symbol@*@lo and
8162 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8163 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8164 // operand to the global variable.
8165 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8166 SDValue PtrHi;
8167 if (GAFlags == SIInstrInfo::MO_NONE)
8168 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8169 else
8170 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8171 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8172 }
8173
LowerGlobalAddress(AMDGPUMachineFunction * MFI,SDValue Op,SelectionDAG & DAG) const8174 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8175 SDValue Op,
8176 SelectionDAG &DAG) const {
8177 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8178 SDLoc DL(GSD);
8179 EVT PtrVT = Op.getValueType();
8180
8181 const GlobalValue *GV = GSD->getGlobal();
8182 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8183 shouldUseLDSConstAddress(GV)) ||
8184 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
8185 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
8186 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8187 GV->hasExternalLinkage()) {
8188 Type *Ty = GV->getValueType();
8189 // HIP uses an unsized array `extern __shared__ T s[]` or similar
8190 // zero-sized type in other languages to declare the dynamic shared
8191 // memory which size is not known at the compile time. They will be
8192 // allocated by the runtime and placed directly after the static
8193 // allocated ones. They all share the same offset.
8194 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8195 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8196 // Adjust alignment for that dynamic shared memory array.
8197 Function &F = DAG.getMachineFunction().getFunction();
8198 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
8199 MFI->setUsesDynamicLDS(true);
8200 return SDValue(
8201 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8202 }
8203 }
8204 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
8205 }
8206
8207 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8208 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8209 SIInstrInfo::MO_ABS32_LO);
8210 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8211 }
8212
8213 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8214 SDValue AddrLo = DAG.getTargetGlobalAddress(
8215 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
8216 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
8217
8218 SDValue AddrHi = DAG.getTargetGlobalAddress(
8219 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
8220 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
8221
8222 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
8223 }
8224
8225 if (shouldEmitFixup(GV))
8226 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
8227
8228 if (shouldEmitPCReloc(GV))
8229 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
8230 SIInstrInfo::MO_REL32);
8231
8232 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
8233 SIInstrInfo::MO_GOTPCREL32);
8234 PointerType *PtrTy =
8235 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
8236 const DataLayout &DataLayout = DAG.getDataLayout();
8237 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
8238 MachinePointerInfo PtrInfo =
8239 MachinePointerInfo::getGOT(DAG.getMachineFunction());
8240
8241 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
8242 MachineMemOperand::MODereferenceable |
8243 MachineMemOperand::MOInvariant);
8244 }
8245
copyToM0(SelectionDAG & DAG,SDValue Chain,const SDLoc & DL,SDValue V) const8246 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
8247 const SDLoc &DL, SDValue V) const {
8248 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
8249 // the destination register.
8250 //
8251 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
8252 // so we will end up with redundant moves to m0.
8253 //
8254 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
8255
8256 // A Null SDValue creates a glue result.
8257 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
8258 V, Chain);
8259 return SDValue(M0, 0);
8260 }
8261
lowerImplicitZextParam(SelectionDAG & DAG,SDValue Op,MVT VT,unsigned Offset) const8262 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
8263 MVT VT,
8264 unsigned Offset) const {
8265 SDLoc SL(Op);
8266 SDValue Param = lowerKernargMemParameter(
8267 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
8268 // The local size values will have the hi 16-bits as zero.
8269 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
8270 DAG.getValueType(VT));
8271 }
8272
emitNonHSAIntrinsicError(SelectionDAG & DAG,const SDLoc & DL,EVT VT)8273 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8274 EVT VT) {
8275 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8276 DAG.getMachineFunction().getFunction(),
8277 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8278 return DAG.getPOISON(VT);
8279 }
8280
emitRemovedIntrinsicError(SelectionDAG & DAG,const SDLoc & DL,EVT VT)8281 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8282 EVT VT) {
8283 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8284 DAG.getMachineFunction().getFunction(),
8285 "intrinsic not supported on subtarget", DL.getDebugLoc()));
8286 return DAG.getPOISON(VT);
8287 }
8288
getBuildDwordsVector(SelectionDAG & DAG,SDLoc DL,ArrayRef<SDValue> Elts)8289 static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
8290 ArrayRef<SDValue> Elts) {
8291 assert(!Elts.empty());
8292 MVT Type;
8293 unsigned NumElts = Elts.size();
8294
8295 if (NumElts <= 12) {
8296 Type = MVT::getVectorVT(MVT::f32, NumElts);
8297 } else {
8298 assert(Elts.size() <= 16);
8299 Type = MVT::v16f32;
8300 NumElts = 16;
8301 }
8302
8303 SmallVector<SDValue, 16> VecElts(NumElts);
8304 for (unsigned i = 0; i < Elts.size(); ++i) {
8305 SDValue Elt = Elts[i];
8306 if (Elt.getValueType() != MVT::f32)
8307 Elt = DAG.getBitcast(MVT::f32, Elt);
8308 VecElts[i] = Elt;
8309 }
8310 for (unsigned i = Elts.size(); i < NumElts; ++i)
8311 VecElts[i] = DAG.getPOISON(MVT::f32);
8312
8313 if (NumElts == 1)
8314 return VecElts[0];
8315 return DAG.getBuildVector(Type, DL, VecElts);
8316 }
8317
padEltsToUndef(SelectionDAG & DAG,const SDLoc & DL,EVT CastVT,SDValue Src,int ExtraElts)8318 static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
8319 SDValue Src, int ExtraElts) {
8320 EVT SrcVT = Src.getValueType();
8321
8322 SmallVector<SDValue, 8> Elts;
8323
8324 if (SrcVT.isVector())
8325 DAG.ExtractVectorElements(Src, Elts);
8326 else
8327 Elts.push_back(Src);
8328
8329 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
8330 while (ExtraElts--)
8331 Elts.push_back(Undef);
8332
8333 return DAG.getBuildVector(CastVT, DL, Elts);
8334 }
8335
8336 // Re-construct the required return value for a image load intrinsic.
8337 // This is more complicated due to the optional use TexFailCtrl which means the
8338 // required return type is an aggregate
constructRetValue(SelectionDAG & DAG,MachineSDNode * Result,ArrayRef<EVT> ResultTypes,bool IsTexFail,bool Unpacked,bool IsD16,int DMaskPop,int NumVDataDwords,bool IsAtomicPacked16Bit,const SDLoc & DL)8339 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
8340 ArrayRef<EVT> ResultTypes, bool IsTexFail,
8341 bool Unpacked, bool IsD16, int DMaskPop,
8342 int NumVDataDwords, bool IsAtomicPacked16Bit,
8343 const SDLoc &DL) {
8344 // Determine the required return type. This is the same regardless of
8345 // IsTexFail flag
8346 EVT ReqRetVT = ResultTypes[0];
8347 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
8348 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
8349 ? (ReqRetNumElts + 1) / 2
8350 : ReqRetNumElts;
8351
8352 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
8353
8354 MVT DataDwordVT =
8355 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
8356
8357 MVT MaskPopVT =
8358 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
8359
8360 SDValue Data(Result, 0);
8361 SDValue TexFail;
8362
8363 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
8364 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
8365 if (MaskPopVT.isVector()) {
8366 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
8367 SDValue(Result, 0), ZeroIdx);
8368 } else {
8369 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
8370 SDValue(Result, 0), ZeroIdx);
8371 }
8372 }
8373
8374 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
8375 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
8376 NumDataDwords - MaskPopDwords);
8377
8378 if (IsD16)
8379 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
8380
8381 EVT LegalReqRetVT = ReqRetVT;
8382 if (!ReqRetVT.isVector()) {
8383 if (!Data.getValueType().isInteger())
8384 Data = DAG.getNode(ISD::BITCAST, DL,
8385 Data.getValueType().changeTypeToInteger(), Data);
8386 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
8387 } else {
8388 // We need to widen the return vector to a legal type
8389 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
8390 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
8391 LegalReqRetVT =
8392 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
8393 ReqRetVT.getVectorNumElements() + 1);
8394 }
8395 }
8396 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
8397
8398 if (IsTexFail) {
8399 TexFail =
8400 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
8401 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
8402
8403 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
8404 }
8405
8406 if (Result->getNumValues() == 1)
8407 return Data;
8408
8409 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
8410 }
8411
parseTexFail(SDValue TexFailCtrl,SelectionDAG & DAG,SDValue * TFE,SDValue * LWE,bool & IsTexFail)8412 static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
8413 SDValue *LWE, bool &IsTexFail) {
8414 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
8415
8416 uint64_t Value = TexFailCtrlConst->getZExtValue();
8417 if (Value) {
8418 IsTexFail = true;
8419 }
8420
8421 SDLoc DL(TexFailCtrlConst);
8422 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
8423 Value &= ~(uint64_t)0x1;
8424 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
8425 Value &= ~(uint64_t)0x2;
8426
8427 return Value == 0;
8428 }
8429
packImage16bitOpsToDwords(SelectionDAG & DAG,SDValue Op,MVT PackVectorVT,SmallVectorImpl<SDValue> & PackedAddrs,unsigned DimIdx,unsigned EndIdx,unsigned NumGradients)8430 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
8431 MVT PackVectorVT,
8432 SmallVectorImpl<SDValue> &PackedAddrs,
8433 unsigned DimIdx, unsigned EndIdx,
8434 unsigned NumGradients) {
8435 SDLoc DL(Op);
8436 for (unsigned I = DimIdx; I < EndIdx; I++) {
8437 SDValue Addr = Op.getOperand(I);
8438
8439 // Gradients are packed with undef for each coordinate.
8440 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
8441 // 1D: undef,dx/dh; undef,dx/dv
8442 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
8443 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
8444 if (((I + 1) >= EndIdx) ||
8445 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8446 I == DimIdx + NumGradients - 1))) {
8447 if (Addr.getValueType() != MVT::i16)
8448 Addr = DAG.getBitcast(MVT::i16, Addr);
8449 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
8450 } else {
8451 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
8452 I++;
8453 }
8454 Addr = DAG.getBitcast(MVT::f32, Addr);
8455 PackedAddrs.push_back(Addr);
8456 }
8457 }
8458
// Lower the image intrinsic Op, described by Intr, to a MIMG machine node.
//
// WithChain selects where the intrinsic's arguments start among Op's
// operands: index 2 when true, index 1 otherwise.
//
// The function returns Op itself whenever it declines to lower: D16 data on
// an instruction/subtarget without D16 support, a return type too small for
// the dmask, 16-bit addresses/gradients the subtarget cannot encode, an
// invalid texfailctrl or cache-policy value, an unexpected resource or
// sampler type, or no matching MIMG opcode. Otherwise it returns the new
// machine node's result(s), reshaped to the intrinsic's original result types
// where necessary.
SDValue SITargetLowering::lowerImage(SDValue Op,
                                     const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                     SelectionDAG &DAG, bool WithChain) const {
  SDLoc DL(Op);
  MachineFunction &MF = DAG.getMachineFunction();
  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  unsigned IntrOpcode = Intr->BaseOpcode;
  bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
  bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
  bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);

  // ResultTypes is rewritten below to match what the machine instruction
  // returns; OrigResultTypes keeps the types the intrinsic asked for.
  SmallVector<EVT, 3> ResultTypes(Op->values());
  SmallVector<EVT, 3> OrigResultTypes(Op->values());
  bool IsD16 = false;
  bool IsG16 = false;
  bool IsA16 = false;
  SDValue VData;
  int NumVDataDwords = 0;
  bool AdjustRetType = false;
  bool IsAtomicPacked16Bit = false;

  // Offset of intrinsic arguments
  const unsigned ArgOffset = WithChain ? 2 : 1;

  unsigned DMask;
  unsigned DMaskLanes = 0;

  // Step 1: work out the data operand (for stores/atomics), the dmask and
  // the number of data dwords.
  if (BaseOpcode->Atomic) {
    VData = Op.getOperand(2);

    IsAtomicPacked16Bit =
        (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
         Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);

    bool Is64Bit = VData.getValueSizeInBits() == 64;
    if (BaseOpcode->AtomicX2) {
      // Two data operands: combine them into one vector and double the
      // dmask / dword count accordingly.
      SDValue VData2 = Op.getOperand(3);
      VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
                                 {VData, VData2});
      if (Is64Bit)
        VData = DAG.getBitcast(MVT::v4i32, VData);

      ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
    // Gather4 always counts as four lanes, regardless of the dmask bits.
    DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);

    if (BaseOpcode->Store) {
      VData = Op.getOperand(2);

      MVT StoreVT = VData.getSimpleValueType();
      if (StoreVT.getScalarType() == MVT::f16) {
        if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
          return Op; // D16 is unsupported for this instruction

        IsD16 = true;
        VData = handleD16VData(VData, DAG, true);
      }

      // Round the data size up to whole dwords.
      NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
    } else if (!BaseOpcode->NoReturn) {
      // Work out the num dwords based on the dmask popcount and underlying type
      // and whether packing is supported.
      MVT LoadVT = ResultTypes[0].getSimpleVT();
      if (LoadVT.getScalarType() == MVT::f16) {
        if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
          return Op; // D16 is unsupported for this instruction

        IsD16 = true;
      }

      // Confirm that the return type is large enough for the dmask specified
      if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
          (!LoadVT.isVector() && DMaskLanes > 1))
        return Op;

      // The sq block of gfx8 and gfx9 do not estimate register use correctly
      // for d16 image_gather4, image_gather4_l, and image_gather4_lz
      // instructions.
      if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
          !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
        NumVDataDwords = (DMaskLanes + 1) / 2;
      else
        NumVDataDwords = DMaskLanes;

      AdjustRetType = true;
    }
  }

  // Step 2: collect the address operands, packing 16-bit ones into dwords.
  unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
  SmallVector<SDValue, 4> VAddrs;

  // Check for 16 bit addresses or derivatives and pack if true.
  // G16 is decided from the type of the first gradient operand ...
  MVT VAddrVT =
      Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
  MVT VAddrScalarVT = VAddrVT.getScalarType();
  MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;

  // ... and A16 from the type of the first coordinate operand.
  VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
  VAddrScalarVT = VAddrVT.getScalarType();
  MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;

  // Push back extra arguments.
  for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
    if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
      assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
      // Special handling of bias when A16 is on. Bias is of type half but
      // occupies full 32-bit.
      SDValue Bias = DAG.getBuildVector(
          MVT::v2f16, DL,
          {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
      VAddrs.push_back(Bias);
    } else {
      assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
             "Bias needs to be converted to 16 bit in A16 mode");
      VAddrs.push_back(Op.getOperand(ArgOffset + I));
    }
  }

  if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control
    // so both gradients and addresses must be 16 bit
    LLVM_DEBUG(
        dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
                  "require 16 bit args for both gradients and addresses");
    return Op;
  }

  if (IsA16) {
    if (!ST->hasA16()) {
      LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
                           "support 16 bit addresses\n");
      return Op;
    }
  }

  // We've dealt with incorrect input so we know that if IsA16, IsG16
  // are set then we have to compress/pack operands (either address,
  // gradient or both)
  // In the case where a16 and gradients are tied (no G16 support) then we
  // have already verified that both IsA16 and IsG16 are true
  if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
    // Activate g16
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
    IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
  }

  // Add gradients (packed or unpacked)
  if (IsG16) {
    // Pack the gradients
    // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
    packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
                              ArgOffset + Intr->GradientStart,
                              ArgOffset + Intr->CoordStart, Intr->NumGradients);
  } else {
    for (unsigned I = ArgOffset + Intr->GradientStart;
         I < ArgOffset + Intr->CoordStart; I++)
      VAddrs.push_back(Op.getOperand(I));
  }

  // Add addresses (packed or unpacked)
  if (IsA16) {
    packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
                              ArgOffset + Intr->CoordStart, VAddrEnd,
                              0 /* No gradients */);
  } else {
    // Add uncompressed address
    for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
      VAddrs.push_back(Op.getOperand(I));
  }

  // Step 3: choose between the NSA, partial-NSA and contiguous address forms.
  //
  // If the register allocator cannot place the address registers contiguously
  // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually a
  // wash in terms of code size or even better.
  //
  // However, we currently have no way of hinting to the register allocator that
  // MIMG addresses should be placed contiguously when it is possible to do so,
  // so force non-NSA for the common 2-address case as a heuristic.
  //
  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
  // allocation when possible.
  //
  // Partial NSA is allowed on GFX11+ where the final register is a contiguous
  // set of the remaining addresses.
  const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
  const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
  const bool UseNSA = ST->hasNSAEncoding() &&
                      VAddrs.size() >= ST->getNSAThreshold(MF) &&
                      (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
  const bool UsePartialNSA =
      UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;

  // VAddr stays null for full NSA. For partial NSA it holds the addresses
  // from index NSAMaxSize - 1 onwards; without NSA it holds all of them.
  SDValue VAddr;
  if (UsePartialNSA) {
    VAddr = getBuildDwordsVector(DAG, DL,
                                 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
  } else if (!UseNSA) {
    VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
  }

  // Step 4: decode the remaining immediate operands.
  SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
  SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
  SDValue Unorm;
  if (!BaseOpcode->Sampler) {
    // Instructions without a sampler always set unorm.
    Unorm = True;
  } else {
    uint64_t UnormConst =
        Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);

    Unorm = UnormConst ? True : False;
  }

  SDValue TFE;
  SDValue LWE;
  SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
  bool IsTexFail = false;
  if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
    return Op;

  if (IsTexFail) {
    if (!DMaskLanes) {
      // Expecting to get an error flag since TFC is on - and dmask is 0
      // Force dmask to be at least 1 otherwise the instruction will fail
      DMask = 0x1;
      DMaskLanes = 1;
      NumVDataDwords = 1;
    }
    // One extra dword is returned for the TexFail status.
    NumVDataDwords += 1;
    AdjustRetType = true;
  }

  // Has something earlier tagged that the return type needs adjusting
  // This happens if the instruction is a load or has set TexFailCtrl flags
  if (AdjustRetType) {
    // NumVDataDwords reflects the true number of dwords required in the return
    // type
    if (DMaskLanes == 0 && !BaseOpcode->Store) {
      // This is a no-op load. This can be eliminated
      SDValue Undef = DAG.getPOISON(Op.getValueType());
      if (isa<MemSDNode>(Op))
        return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
      return Undef;
    }

    EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
                                                      MVT::i32, NumVDataDwords)
                                   : MVT::i32;

    ResultTypes[0] = NewVT;
    if (ResultTypes.size() == 3) {
      // Original result was aggregate type used for TexFailCtrl results
      // The actual instruction returns as a vector type which has now been
      // created. Remove the aggregate result.
      ResultTypes.erase(&ResultTypes[1]);
    }
  }

  // Reject cache-policy bits that are not valid for this generation.
  unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
  if (BaseOpcode->Atomic)
    CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
  if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
               AMDGPU::CPol::VOLATILE))
    return Op;

  // Step 5: assemble the machine node's operand list. Which operands are
  // present depends on the generation and on the base opcode's properties.
  SmallVector<SDValue, 26> Ops;
  if (BaseOpcode->Store || BaseOpcode->Atomic)
    Ops.push_back(VData); // vdata
  if (UsePartialNSA) {
    // Leading addresses individually, the rest as one contiguous vector.
    append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
    Ops.push_back(VAddr);
  } else if (UseNSA)
    append_range(Ops, VAddrs);
  else
    Ops.push_back(VAddr);
  SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
  EVT RsrcVT = Rsrc.getValueType();
  if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
    return Op;
  Ops.push_back(Rsrc);
  if (BaseOpcode->Sampler) {
    SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
    if (Samp.getValueType() != MVT::v4i32)
      return Op;
    Ops.push_back(Samp);
  }
  Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
  if (IsGFX10Plus)
    Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    Ops.push_back(Unorm);
  Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
  Ops.push_back(IsA16 && // r128, a16 for gfx9
                        ST->hasFeature(AMDGPU::FeatureR128A16)
                    ? True
                    : False);
  if (IsGFX10Plus)
    Ops.push_back(IsA16 ? True : False);

  // Subtargets with GFX90A instructions take no TFE operand; requesting TFE
  // there is diagnosed and lowering continues without it.
  if (!Subtarget->hasGFX90AInsts())
    Ops.push_back(TFE); // tfe
  else if (TFE->getAsZExtVal()) {
    DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
        DAG.getMachineFunction().getFunction(),
        "TFE is not supported on this GPU", DL.getDebugLoc()));
  }

  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    Ops.push_back(LWE); // lwe
  if (!IsGFX10Plus)
    Ops.push_back(DimInfo->DA ? True : False);
  if (BaseOpcode->HasD16)
    Ops.push_back(IsD16 ? True : False);
  if (isa<MemSDNode>(Op))
    Ops.push_back(Op.getOperand(0)); // chain

  // Step 6: look up the concrete opcode for this encoding and these
  // data/address dword counts.
  int NumVAddrDwords =
      UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
  int Opcode = -1;

  if (IsGFX12Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX11Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx11NSA
                                          : AMDGPU::MIMGEncGfx11Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX10Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (Subtarget->hasGFX90AInsts()) {
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
                                     NumVDataDwords, NumVAddrDwords);
      if (Opcode == -1) {
        DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
            DAG.getMachineFunction().getFunction(),
            "requested image instruction is not supported on this GPU",
            DL.getDebugLoc()));

        // Replace every original result with poison, except the chain,
        // which is forwarded from the input.
        unsigned Idx = 0;
        SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
        for (EVT VT : OrigResultTypes) {
          if (VT == MVT::Other)
            RetValues[Idx++] = Op.getOperand(0); // Chain
          else
            RetValues[Idx++] = DAG.getPOISON(VT);
        }

        return DAG.getMergeValues(RetValues, DL);
      }
    }
    // Try the GFX8 encoding first where available, then fall back to GFX6.
    if (Opcode == -1 &&
        Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  if (Opcode == -1)
    return Op;

  // Step 7: create the machine node and carry over the memory operand.
  MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
  if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
    MachineMemOperand *MemRef = MemOp->getMemOperand();
    DAG.setNodeMemRefs(NewNode, {MemRef});
  }

  // Step 8: shape the result to what the intrinsic's users expect.
  if (BaseOpcode->AtomicX2) {
    // Only the first element of the returned vector is handed back.
    SmallVector<SDValue, 1> Elt;
    DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
    return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
  }
  if (BaseOpcode->NoReturn)
    return SDValue(NewNode, 0);
  return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
                           Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
                           NumVDataDwords, IsAtomicPacked16Bit, DL);
}
8854
lowerSBuffer(EVT VT,SDLoc DL,SDValue Rsrc,SDValue Offset,SDValue CachePolicy,SelectionDAG & DAG) const8855 SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8856 SDValue Offset, SDValue CachePolicy,
8857 SelectionDAG &DAG) const {
8858 MachineFunction &MF = DAG.getMachineFunction();
8859
8860 const DataLayout &DataLayout = DAG.getDataLayout();
8861 Align Alignment =
8862 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
8863
8864 MachineMemOperand *MMO = MF.getMachineMemOperand(
8865 MachinePointerInfo(),
8866 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
8867 MachineMemOperand::MOInvariant,
8868 VT.getStoreSize(), Alignment);
8869
8870 if (!Offset->isDivergent()) {
8871 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8872
8873 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8874 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8875 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
8876 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8877 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8878 SDValue BufferLoad =
8879 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
8880 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8881 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8882 }
8883
8884 // Widen vec3 load to vec4.
8885 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8886 !Subtarget->hasScalarDwordx3Loads()) {
8887 EVT WidenedVT =
8888 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
8889 auto WidenedOp = DAG.getMemIntrinsicNode(
8890 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8891 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8892 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8893 DAG.getVectorIdxConstant(0, DL));
8894 return Subvector;
8895 }
8896
8897 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
8898 DAG.getVTList(VT), Ops, VT, MMO);
8899 }
8900
8901 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8902 // assume that the buffer is unswizzled.
8903 SDValue Ops[] = {
8904 DAG.getEntryNode(), // Chain
8905 Rsrc, // rsrc
8906 DAG.getConstant(0, DL, MVT::i32), // vindex
8907 {}, // voffset
8908 {}, // soffset
8909 {}, // offset
8910 CachePolicy, // cachepolicy
8911 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8912 };
8913 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8914 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8915 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8916 }
8917
8918 SmallVector<SDValue, 4> Loads;
8919 unsigned NumLoads = 1;
8920 MVT LoadVT = VT.getSimpleVT();
8921 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8922 assert((LoadVT.getScalarType() == MVT::i32 ||
8923 LoadVT.getScalarType() == MVT::f32));
8924
8925 if (NumElts == 8 || NumElts == 16) {
8926 NumLoads = NumElts / 4;
8927 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8928 }
8929
8930 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
8931
8932 // Use the alignment to ensure that the required offsets will fit into the
8933 // immediate offsets.
8934 setBufferOffsets(Offset, DAG, &Ops[3],
8935 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8936
8937 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8938 for (unsigned i = 0; i < NumLoads; ++i) {
8939 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8940 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8941 LoadVT, MMO, DAG));
8942 }
8943
8944 if (NumElts == 8 || NumElts == 16)
8945 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8946
8947 return Loads[0];
8948 }
8949
lowerWaveID(SelectionDAG & DAG,SDValue Op) const8950 SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8951 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8952 if (!Subtarget->hasArchitectedSGPRs())
8953 return {};
8954 SDLoc SL(Op);
8955 MVT VT = MVT::i32;
8956 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8957 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8958 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8959 }
8960
lowerWorkitemID(SelectionDAG & DAG,SDValue Op,unsigned Dim,const ArgDescriptor & Arg) const8961 SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8962 unsigned Dim,
8963 const ArgDescriptor &Arg) const {
8964 SDLoc SL(Op);
8965 MachineFunction &MF = DAG.getMachineFunction();
8966 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8967 if (MaxID == 0)
8968 return DAG.getConstant(0, SL, MVT::i32);
8969
8970 // It's undefined behavior if a function marked with the amdgpu-no-*
8971 // attributes uses the corresponding intrinsic.
8972 if (!Arg)
8973 return DAG.getPOISON(Op->getValueType(0));
8974
8975 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8976 SDLoc(DAG.getEntryNode()), Arg);
8977
8978 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8979 // masking operations anyway.
8980 //
8981 // TODO: We could assert the top bit is 0 for the source copy.
8982 if (Arg.isMasked())
8983 return Val;
8984
8985 // Preserve the known bits after expansion to a copy.
8986 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
8987 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8988 DAG.getValueType(SmallVT));
8989 }
8990
LowerINTRINSIC_WO_CHAIN(SDValue Op,SelectionDAG & DAG) const8991 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8992 SelectionDAG &DAG) const {
8993 MachineFunction &MF = DAG.getMachineFunction();
8994 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
8995
8996 EVT VT = Op.getValueType();
8997 SDLoc DL(Op);
8998 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8999
9000 // TODO: Should this propagate fast-math-flags?
9001
9002 switch (IntrinsicID) {
9003 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9004 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9005 return emitNonHSAIntrinsicError(DAG, DL, VT);
9006 return getPreloadedValue(DAG, *MFI, VT,
9007 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9008 }
9009 case Intrinsic::amdgcn_dispatch_ptr:
9010 case Intrinsic::amdgcn_queue_ptr: {
9011 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9012 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9013 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9014 DL.getDebugLoc()));
9015 return DAG.getPOISON(VT);
9016 }
9017
9018 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9019 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
9020 : AMDGPUFunctionArgInfo::QUEUE_PTR;
9021 return getPreloadedValue(DAG, *MFI, VT, RegID);
9022 }
9023 case Intrinsic::amdgcn_implicitarg_ptr: {
9024 if (MFI->isEntryFunction())
9025 return getImplicitArgPtr(DAG, DL);
9026 return getPreloadedValue(DAG, *MFI, VT,
9027 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9028 }
9029 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9030 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
9031 // This only makes sense to call in a kernel, so just lower to null.
9032 return DAG.getConstant(0, DL, VT);
9033 }
9034
9035 return getPreloadedValue(DAG, *MFI, VT,
9036 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
9037 }
9038 case Intrinsic::amdgcn_dispatch_id: {
9039 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9040 }
9041 case Intrinsic::amdgcn_rcp:
9042 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9043 case Intrinsic::amdgcn_rsq:
9044 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9045 case Intrinsic::amdgcn_rsq_legacy:
9046 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9047 return emitRemovedIntrinsicError(DAG, DL, VT);
9048 return SDValue();
9049 case Intrinsic::amdgcn_rcp_legacy:
9050 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9051 return emitRemovedIntrinsicError(DAG, DL, VT);
9052 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9053 case Intrinsic::amdgcn_rsq_clamp: {
9054 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9055 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9056
9057 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9058 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9059 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9060
9061 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9062 SDValue Tmp =
9063 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9064 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9065 DAG.getConstantFP(Min, DL, VT));
9066 }
9067 case Intrinsic::r600_read_ngroups_x:
9068 if (Subtarget->isAmdHsaOS())
9069 return emitNonHSAIntrinsicError(DAG, DL, VT);
9070
9071 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9072 SI::KernelInputOffsets::NGROUPS_X, Align(4),
9073 false);
9074 case Intrinsic::r600_read_ngroups_y:
9075 if (Subtarget->isAmdHsaOS())
9076 return emitNonHSAIntrinsicError(DAG, DL, VT);
9077
9078 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9079 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
9080 false);
9081 case Intrinsic::r600_read_ngroups_z:
9082 if (Subtarget->isAmdHsaOS())
9083 return emitNonHSAIntrinsicError(DAG, DL, VT);
9084
9085 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9086 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
9087 false);
9088 case Intrinsic::r600_read_local_size_x:
9089 if (Subtarget->isAmdHsaOS())
9090 return emitNonHSAIntrinsicError(DAG, DL, VT);
9091
9092 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9093 SI::KernelInputOffsets::LOCAL_SIZE_X);
9094 case Intrinsic::r600_read_local_size_y:
9095 if (Subtarget->isAmdHsaOS())
9096 return emitNonHSAIntrinsicError(DAG, DL, VT);
9097
9098 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9099 SI::KernelInputOffsets::LOCAL_SIZE_Y);
9100 case Intrinsic::r600_read_local_size_z:
9101 if (Subtarget->isAmdHsaOS())
9102 return emitNonHSAIntrinsicError(DAG, DL, VT);
9103
9104 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9105 SI::KernelInputOffsets::LOCAL_SIZE_Z);
9106 case Intrinsic::amdgcn_workgroup_id_x:
9107 return getPreloadedValue(DAG, *MFI, VT,
9108 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
9109 case Intrinsic::amdgcn_workgroup_id_y:
9110 return getPreloadedValue(DAG, *MFI, VT,
9111 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
9112 case Intrinsic::amdgcn_workgroup_id_z:
9113 return getPreloadedValue(DAG, *MFI, VT,
9114 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
9115 case Intrinsic::amdgcn_wave_id:
9116 return lowerWaveID(DAG, Op);
9117 case Intrinsic::amdgcn_lds_kernel_id: {
9118 if (MFI->isEntryFunction())
9119 return getLDSKernelId(DAG, DL);
9120 return getPreloadedValue(DAG, *MFI, VT,
9121 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
9122 }
9123 case Intrinsic::amdgcn_workitem_id_x:
9124 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9125 case Intrinsic::amdgcn_workitem_id_y:
9126 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9127 case Intrinsic::amdgcn_workitem_id_z:
9128 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9129 case Intrinsic::amdgcn_wavefrontsize:
9130 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
9131 SDLoc(Op), MVT::i32);
9132 case Intrinsic::amdgcn_s_buffer_load: {
9133 unsigned CPol = Op.getConstantOperandVal(3);
9134 // s_buffer_load, because of how it's optimized, can't be volatile
9135 // so reject ones with the volatile bit set.
9136 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
9137 ? AMDGPU::CPol::ALL
9138 : AMDGPU::CPol::ALL_pregfx12))
9139 return Op;
9140 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9141 Op.getOperand(3), DAG);
9142 }
9143 case Intrinsic::amdgcn_fdiv_fast:
9144 return lowerFDIV_FAST(Op, DAG);
9145 case Intrinsic::amdgcn_sin:
9146 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
9147
9148 case Intrinsic::amdgcn_cos:
9149 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
9150
9151 case Intrinsic::amdgcn_mul_u24:
9152 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
9153 Op.getOperand(2));
9154 case Intrinsic::amdgcn_mul_i24:
9155 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
9156 Op.getOperand(2));
9157
9158 case Intrinsic::amdgcn_log_clamp: {
9159 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9160 return SDValue();
9161
9162 return emitRemovedIntrinsicError(DAG, DL, VT);
9163 }
9164 case Intrinsic::amdgcn_fract:
9165 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
9166
9167 case Intrinsic::amdgcn_class:
9168 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
9169 Op.getOperand(2));
9170 case Intrinsic::amdgcn_div_fmas:
9171 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
9172 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9173
9174 case Intrinsic::amdgcn_div_fixup:
9175 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
9176 Op.getOperand(2), Op.getOperand(3));
9177
9178 case Intrinsic::amdgcn_div_scale: {
9179 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
9180
9181 // Translate to the operands expected by the machine instruction. The
9182 // first parameter must be the same as the first instruction.
9183 SDValue Numerator = Op.getOperand(1);
9184 SDValue Denominator = Op.getOperand(2);
9185
9186 // Note this order is opposite of the machine instruction's operations,
9187 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
9188 // intrinsic has the numerator as the first operand to match a normal
9189 // division operation.
9190
9191 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9192
9193 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
9194 Denominator, Numerator);
9195 }
9196 case Intrinsic::amdgcn_icmp: {
9197 // There is a Pat that handles this variant, so return it as-is.
9198 if (Op.getOperand(1).getValueType() == MVT::i1 &&
9199 Op.getConstantOperandVal(2) == 0 &&
9200 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
9201 return Op;
9202 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
9203 }
9204 case Intrinsic::amdgcn_fcmp: {
9205 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
9206 }
9207 case Intrinsic::amdgcn_ballot:
9208 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
9209 case Intrinsic::amdgcn_fmed3:
9210 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
9211 Op.getOperand(2), Op.getOperand(3));
9212 case Intrinsic::amdgcn_fdot2:
9213 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
9214 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9215 case Intrinsic::amdgcn_fmul_legacy:
9216 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
9217 Op.getOperand(2));
9218 case Intrinsic::amdgcn_sffbh:
9219 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
9220 case Intrinsic::amdgcn_sbfe:
9221 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
9222 Op.getOperand(2), Op.getOperand(3));
9223 case Intrinsic::amdgcn_ubfe:
9224 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
9225 Op.getOperand(2), Op.getOperand(3));
9226 case Intrinsic::amdgcn_cvt_pkrtz:
9227 case Intrinsic::amdgcn_cvt_pknorm_i16:
9228 case Intrinsic::amdgcn_cvt_pknorm_u16:
9229 case Intrinsic::amdgcn_cvt_pk_i16:
9230 case Intrinsic::amdgcn_cvt_pk_u16: {
9231 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
9232 EVT VT = Op.getValueType();
9233 unsigned Opcode;
9234
9235 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
9236 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
9237 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
9238 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
9239 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
9240 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
9241 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
9242 Opcode = AMDGPUISD::CVT_PK_I16_I32;
9243 else
9244 Opcode = AMDGPUISD::CVT_PK_U16_U32;
9245
9246 if (isTypeLegal(VT))
9247 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
9248
9249 SDValue Node =
9250 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
9251 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
9252 }
9253 case Intrinsic::amdgcn_fmad_ftz:
9254 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
9255 Op.getOperand(2), Op.getOperand(3));
9256
9257 case Intrinsic::amdgcn_if_break:
9258 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
9259 Op->getOperand(1), Op->getOperand(2)),
9260 0);
9261
9262 case Intrinsic::amdgcn_groupstaticsize: {
9263 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
9264 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
9265 return Op;
9266
9267 const Module *M = MF.getFunction().getParent();
9268 const GlobalValue *GV =
9269 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
9270 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
9271 SIInstrInfo::MO_ABS32_LO);
9272 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
9273 }
9274 case Intrinsic::amdgcn_is_shared:
9275 case Intrinsic::amdgcn_is_private: {
9276 SDLoc SL(Op);
9277 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
9278 ? AMDGPUAS::LOCAL_ADDRESS
9279 : AMDGPUAS::PRIVATE_ADDRESS;
9280 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
9281 SDValue SrcVec =
9282 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
9283
9284 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
9285 DAG.getConstant(1, SL, MVT::i32));
9286 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
9287 }
9288 case Intrinsic::amdgcn_perm:
9289 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
9290 Op.getOperand(2), Op.getOperand(3));
9291 case Intrinsic::amdgcn_reloc_constant: {
9292 Module *M = const_cast<Module *>(MF.getFunction().getParent());
9293 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
9294 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
9295 auto *RelocSymbol = cast<GlobalVariable>(
9296 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
9297 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
9298 SIInstrInfo::MO_ABS32_LO);
9299 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
9300 }
9301 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
9302 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
9303 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
9304 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
9305 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
9306 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
9307 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
9308 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
9309 if (Op.getOperand(4).getValueType() == MVT::i32)
9310 return SDValue();
9311
9312 SDLoc SL(Op);
9313 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
9314 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9315 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9316 Op.getOperand(3), IndexKeyi32);
9317 }
9318 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
9319 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
9320 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
9321 if (Op.getOperand(6).getValueType() == MVT::i32)
9322 return SDValue();
9323
9324 SDLoc SL(Op);
9325 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
9326 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9327 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9328 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9329 IndexKeyi32, Op.getOperand(7)});
9330 }
9331 case Intrinsic::amdgcn_addrspacecast_nonnull:
9332 return lowerADDRSPACECAST(Op, DAG);
9333 case Intrinsic::amdgcn_readlane:
9334 case Intrinsic::amdgcn_readfirstlane:
9335 case Intrinsic::amdgcn_writelane:
9336 case Intrinsic::amdgcn_permlane16:
9337 case Intrinsic::amdgcn_permlanex16:
9338 case Intrinsic::amdgcn_permlane64:
9339 case Intrinsic::amdgcn_set_inactive:
9340 case Intrinsic::amdgcn_set_inactive_chain_arg:
9341 case Intrinsic::amdgcn_mov_dpp8:
9342 case Intrinsic::amdgcn_update_dpp:
9343 return lowerLaneOp(*this, Op.getNode(), DAG);
9344 case Intrinsic::amdgcn_dead: {
9345 SmallVector<SDValue, 8> Poisons;
9346 for (const EVT ValTy : Op.getNode()->values())
9347 Poisons.push_back(DAG.getPOISON(ValTy));
9348 return DAG.getMergeValues(Poisons, SDLoc(Op));
9349 }
9350 default:
9351 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9352 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
9353 return lowerImage(Op, ImageDimIntr, DAG, false);
9354
9355 return Op;
9356 }
9357 }
9358
9359 // On targets not supporting constant in soffset field, turn zero to
9360 // SGPR_NULL to avoid generating an extra s_mov with zero.
selectSOffset(SDValue SOffset,SelectionDAG & DAG,const GCNSubtarget * Subtarget)9361 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
9362 const GCNSubtarget *Subtarget) {
9363 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
9364 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
9365 return SOffset;
9366 }
9367
lowerRawBufferAtomicIntrin(SDValue Op,SelectionDAG & DAG,unsigned NewOpcode) const9368 SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
9369 SelectionDAG &DAG,
9370 unsigned NewOpcode) const {
9371 SDLoc DL(Op);
9372
9373 SDValue VData = Op.getOperand(2);
9374 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9375 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9376 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9377 SDValue Ops[] = {
9378 Op.getOperand(0), // Chain
9379 VData, // vdata
9380 Rsrc, // rsrc
9381 DAG.getConstant(0, DL, MVT::i32), // vindex
9382 VOffset, // voffset
9383 SOffset, // soffset
9384 Offset, // offset
9385 Op.getOperand(6), // cachepolicy
9386 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9387 };
9388
9389 auto *M = cast<MemSDNode>(Op);
9390
9391 EVT MemVT = VData.getValueType();
9392 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9393 M->getMemOperand());
9394 }
9395
9396 SDValue
lowerStructBufferAtomicIntrin(SDValue Op,SelectionDAG & DAG,unsigned NewOpcode) const9397 SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
9398 unsigned NewOpcode) const {
9399 SDLoc DL(Op);
9400
9401 SDValue VData = Op.getOperand(2);
9402 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9403 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9404 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9405 SDValue Ops[] = {
9406 Op.getOperand(0), // Chain
9407 VData, // vdata
9408 Rsrc, // rsrc
9409 Op.getOperand(4), // vindex
9410 VOffset, // voffset
9411 SOffset, // soffset
9412 Offset, // offset
9413 Op.getOperand(7), // cachepolicy
9414 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9415 };
9416
9417 auto *M = cast<MemSDNode>(Op);
9418
9419 EVT MemVT = VData.getValueType();
9420 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9421 M->getMemOperand());
9422 }
9423
LowerINTRINSIC_W_CHAIN(SDValue Op,SelectionDAG & DAG) const9424 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
9425 SelectionDAG &DAG) const {
9426 unsigned IntrID = Op.getConstantOperandVal(1);
9427 SDLoc DL(Op);
9428
9429 switch (IntrID) {
9430 case Intrinsic::amdgcn_ds_ordered_add:
9431 case Intrinsic::amdgcn_ds_ordered_swap: {
9432 MemSDNode *M = cast<MemSDNode>(Op);
9433 SDValue Chain = M->getOperand(0);
9434 SDValue M0 = M->getOperand(2);
9435 SDValue Value = M->getOperand(3);
9436 unsigned IndexOperand = M->getConstantOperandVal(7);
9437 unsigned WaveRelease = M->getConstantOperandVal(8);
9438 unsigned WaveDone = M->getConstantOperandVal(9);
9439
9440 unsigned OrderedCountIndex = IndexOperand & 0x3f;
9441 IndexOperand &= ~0x3f;
9442 unsigned CountDw = 0;
9443
9444 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
9445 CountDw = (IndexOperand >> 24) & 0xf;
9446 IndexOperand &= ~(0xf << 24);
9447
9448 if (CountDw < 1 || CountDw > 4) {
9449 const Function &Fn = DAG.getMachineFunction().getFunction();
9450 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9451 Fn, "ds_ordered_count: dword count must be between 1 and 4",
9452 DL.getDebugLoc()));
9453 CountDw = 1;
9454 }
9455 }
9456
9457 if (IndexOperand) {
9458 const Function &Fn = DAG.getMachineFunction().getFunction();
9459 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9460 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
9461 }
9462
9463 if (WaveDone && !WaveRelease) {
9464 // TODO: Move this to IR verifier
9465 const Function &Fn = DAG.getMachineFunction().getFunction();
9466 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9467 Fn, "ds_ordered_count: wave_done requires wave_release",
9468 DL.getDebugLoc()));
9469 }
9470
9471 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9472 unsigned ShaderType =
9473 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
9474 unsigned Offset0 = OrderedCountIndex << 2;
9475 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9476
9477 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
9478 Offset1 |= (CountDw - 1) << 6;
9479
9480 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
9481 Offset1 |= ShaderType << 2;
9482
9483 unsigned Offset = Offset0 | (Offset1 << 8);
9484
9485 SDValue Ops[] = {
9486 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
9487 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
9488 };
9489 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
9490 M->getVTList(), Ops, M->getMemoryVT(),
9491 M->getMemOperand());
9492 }
9493 case Intrinsic::amdgcn_raw_buffer_load:
9494 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9495 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9496 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9497 case Intrinsic::amdgcn_raw_buffer_load_format:
9498 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9499 const bool IsFormat =
9500 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9501 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9502
9503 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9504 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9505 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9506 SDValue Ops[] = {
9507 Op.getOperand(0), // Chain
9508 Rsrc, // rsrc
9509 DAG.getConstant(0, DL, MVT::i32), // vindex
9510 VOffset, // voffset
9511 SOffset, // soffset
9512 Offset, // offset
9513 Op.getOperand(5), // cachepolicy, swizzled buffer
9514 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9515 };
9516
9517 auto *M = cast<MemSDNode>(Op);
9518 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9519 }
9520 case Intrinsic::amdgcn_struct_buffer_load:
9521 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9522 case Intrinsic::amdgcn_struct_buffer_load_format:
9523 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9524 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9525 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9526 const bool IsFormat =
9527 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9528 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9529
9530 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9531 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9532 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9533 SDValue Ops[] = {
9534 Op.getOperand(0), // Chain
9535 Rsrc, // rsrc
9536 Op.getOperand(3), // vindex
9537 VOffset, // voffset
9538 SOffset, // soffset
9539 Offset, // offset
9540 Op.getOperand(6), // cachepolicy, swizzled buffer
9541 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9542 };
9543
9544 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
9545 }
9546 case Intrinsic::amdgcn_raw_tbuffer_load:
9547 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9548 MemSDNode *M = cast<MemSDNode>(Op);
9549 EVT LoadVT = Op.getValueType();
9550 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9551 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9552 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9553
9554 SDValue Ops[] = {
9555 Op.getOperand(0), // Chain
9556 Rsrc, // rsrc
9557 DAG.getConstant(0, DL, MVT::i32), // vindex
9558 VOffset, // voffset
9559 SOffset, // soffset
9560 Offset, // offset
9561 Op.getOperand(5), // format
9562 Op.getOperand(6), // cachepolicy, swizzled buffer
9563 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9564 };
9565
9566 if (LoadVT.getScalarType() == MVT::f16)
9567 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9568 Ops);
9569 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9570 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9571 DAG);
9572 }
9573 case Intrinsic::amdgcn_struct_tbuffer_load:
9574 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9575 MemSDNode *M = cast<MemSDNode>(Op);
9576 EVT LoadVT = Op.getValueType();
9577 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9578 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9579 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9580
9581 SDValue Ops[] = {
9582 Op.getOperand(0), // Chain
9583 Rsrc, // rsrc
9584 Op.getOperand(3), // vindex
9585 VOffset, // voffset
9586 SOffset, // soffset
9587 Offset, // offset
9588 Op.getOperand(6), // format
9589 Op.getOperand(7), // cachepolicy, swizzled buffer
9590 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9591 };
9592
9593 if (LoadVT.getScalarType() == MVT::f16)
9594 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9595 Ops);
9596 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9597 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9598 DAG);
9599 }
9600 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9601 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9602 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9603 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9604 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9605 return lowerStructBufferAtomicIntrin(Op, DAG,
9606 AMDGPUISD::BUFFER_ATOMIC_FADD);
9607 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9608 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9609 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9610 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9611 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9612 return lowerStructBufferAtomicIntrin(Op, DAG,
9613 AMDGPUISD::BUFFER_ATOMIC_FMIN);
9614 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9615 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9616 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9617 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9618 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9619 return lowerStructBufferAtomicIntrin(Op, DAG,
9620 AMDGPUISD::BUFFER_ATOMIC_FMAX);
9621 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9622 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9623 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9624 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9625 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9626 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9627 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9628 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9629 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9630 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9631 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9632 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9633 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9634 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9635 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9636 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9637 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9638 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9639 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9640 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9641 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9642 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9643 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9644 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9645 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9646 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9647 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9648 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9649 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9650 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9651 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9652 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9653 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9654 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9655 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9656 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9657 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9658 return lowerRawBufferAtomicIntrin(Op, DAG,
9659 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9660 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9661 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9662 return lowerStructBufferAtomicIntrin(Op, DAG,
9663 AMDGPUISD::BUFFER_ATOMIC_SWAP);
9664 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9665 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9666 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9667 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9668 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9669 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9670 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9671 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9672 return lowerStructBufferAtomicIntrin(Op, DAG,
9673 AMDGPUISD::BUFFER_ATOMIC_SMIN);
9674 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9675 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9676 return lowerStructBufferAtomicIntrin(Op, DAG,
9677 AMDGPUISD::BUFFER_ATOMIC_UMIN);
9678 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9679 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9680 return lowerStructBufferAtomicIntrin(Op, DAG,
9681 AMDGPUISD::BUFFER_ATOMIC_SMAX);
9682 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9683 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9684 return lowerStructBufferAtomicIntrin(Op, DAG,
9685 AMDGPUISD::BUFFER_ATOMIC_UMAX);
9686 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9687 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9688 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9689 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9690 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9691 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9692 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9693 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9694 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9695 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9696 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9697 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9698 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9699 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9700 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9701 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9702 return lowerStructBufferAtomicIntrin(Op, DAG,
9703 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9704
9705 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9706 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9707 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9708 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9709 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9710 SDValue Ops[] = {
9711 Op.getOperand(0), // Chain
9712 Op.getOperand(2), // src
9713 Op.getOperand(3), // cmp
9714 Rsrc, // rsrc
9715 DAG.getConstant(0, DL, MVT::i32), // vindex
9716 VOffset, // voffset
9717 SOffset, // soffset
9718 Offset, // offset
9719 Op.getOperand(7), // cachepolicy
9720 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9721 };
9722 EVT VT = Op.getValueType();
9723 auto *M = cast<MemSDNode>(Op);
9724
9725 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9726 Op->getVTList(), Ops, VT,
9727 M->getMemOperand());
9728 }
9729 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9730 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9731 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9732 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
9733 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9734 SDValue Ops[] = {
9735 Op.getOperand(0), // Chain
9736 Op.getOperand(2), // src
9737 Op.getOperand(3), // cmp
9738 Rsrc, // rsrc
9739 Op.getOperand(5), // vindex
9740 VOffset, // voffset
9741 SOffset, // soffset
9742 Offset, // offset
9743 Op.getOperand(8), // cachepolicy
9744 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9745 };
9746 EVT VT = Op.getValueType();
9747 auto *M = cast<MemSDNode>(Op);
9748
9749 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9750 Op->getVTList(), Ops, VT,
9751 M->getMemOperand());
9752 }
9753 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
9754 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
9755 MemSDNode *M = cast<MemSDNode>(Op);
9756 SDValue NodePtr = M->getOperand(2);
9757 SDValue RayExtent = M->getOperand(3);
9758 SDValue InstanceMask = M->getOperand(4);
9759 SDValue RayOrigin = M->getOperand(5);
9760 SDValue RayDir = M->getOperand(6);
9761 SDValue Offsets = M->getOperand(7);
9762 SDValue TDescr = M->getOperand(8);
9763
9764 assert(NodePtr.getValueType() == MVT::i64);
9765 assert(RayDir.getValueType() == MVT::v3f32);
9766
9767 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
9768 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9769 return SDValue();
9770 }
9771
9772 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
9773 const unsigned NumVDataDwords = 10;
9774 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
9775 int Opcode = AMDGPU::getMIMGOpcode(
9776 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
9777 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
9778 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
9779 assert(Opcode != -1);
9780
9781 SmallVector<SDValue, 7> Ops;
9782 Ops.push_back(NodePtr);
9783 Ops.push_back(DAG.getBuildVector(
9784 MVT::v2i32, DL,
9785 {DAG.getBitcast(MVT::i32, RayExtent),
9786 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
9787 Ops.push_back(RayOrigin);
9788 Ops.push_back(RayDir);
9789 Ops.push_back(Offsets);
9790 Ops.push_back(TDescr);
9791 Ops.push_back(M->getChain());
9792
9793 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9794 MachineMemOperand *MemRef = M->getMemOperand();
9795 DAG.setNodeMemRefs(NewNode, {MemRef});
9796 return SDValue(NewNode, 0);
9797 }
9798 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9799 MemSDNode *M = cast<MemSDNode>(Op);
9800 SDValue NodePtr = M->getOperand(2);
9801 SDValue RayExtent = M->getOperand(3);
9802 SDValue RayOrigin = M->getOperand(4);
9803 SDValue RayDir = M->getOperand(5);
9804 SDValue RayInvDir = M->getOperand(6);
9805 SDValue TDescr = M->getOperand(7);
9806
9807 assert(NodePtr.getValueType() == MVT::i32 ||
9808 NodePtr.getValueType() == MVT::i64);
9809 assert(RayDir.getValueType() == MVT::v3f16 ||
9810 RayDir.getValueType() == MVT::v3f32);
9811
9812 if (!Subtarget->hasGFX10_AEncoding()) {
9813 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9814 return SDValue();
9815 }
9816
9817 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9818 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9819 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9820 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9821 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9822 const unsigned NumVDataDwords = 4;
9823 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9824 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9825 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9826 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9827 IsGFX12Plus;
9828 const unsigned BaseOpcodes[2][2] = {
9829 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9830 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9831 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9832 int Opcode;
9833 if (UseNSA) {
9834 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9835 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9836 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9837 : AMDGPU::MIMGEncGfx10NSA,
9838 NumVDataDwords, NumVAddrDwords);
9839 } else {
9840 assert(!IsGFX12Plus);
9841 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9842 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9843 : AMDGPU::MIMGEncGfx10Default,
9844 NumVDataDwords, NumVAddrDwords);
9845 }
9846 assert(Opcode != -1);
9847
9848 SmallVector<SDValue, 16> Ops;
9849
9850 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
9851 SmallVector<SDValue, 3> Lanes;
9852 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9853 if (Lanes[0].getValueSizeInBits() == 32) {
9854 for (unsigned I = 0; I < 3; ++I)
9855 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9856 } else {
9857 if (IsAligned) {
9858 Ops.push_back(DAG.getBitcast(
9859 MVT::i32,
9860 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
9861 Ops.push_back(Lanes[2]);
9862 } else {
9863 SDValue Elt0 = Ops.pop_back_val();
9864 Ops.push_back(DAG.getBitcast(
9865 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
9866 Ops.push_back(DAG.getBitcast(
9867 MVT::i32,
9868 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
9869 }
9870 }
9871 };
9872
9873 if (UseNSA && IsGFX11Plus) {
9874 Ops.push_back(NodePtr);
9875 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9876 Ops.push_back(RayOrigin);
9877 if (IsA16) {
9878 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9879 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9880 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9881 for (unsigned I = 0; I < 3; ++I) {
9882 MergedLanes.push_back(DAG.getBitcast(
9883 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9884 {DirLanes[I], InvDirLanes[I]})));
9885 }
9886 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9887 } else {
9888 Ops.push_back(RayDir);
9889 Ops.push_back(RayInvDir);
9890 }
9891 } else {
9892 if (Is64)
9893 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9894 2);
9895 else
9896 Ops.push_back(NodePtr);
9897
9898 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9899 packLanes(RayOrigin, true);
9900 packLanes(RayDir, true);
9901 packLanes(RayInvDir, false);
9902 }
9903
9904 if (!UseNSA) {
9905 // Build a single vector containing all the operands so far prepared.
9906 if (NumVAddrDwords > 12) {
9907 SDValue Undef = DAG.getPOISON(MVT::i32);
9908 Ops.append(16 - Ops.size(), Undef);
9909 }
9910 assert(Ops.size() >= 8 && Ops.size() <= 12);
9911 SDValue MergedOps =
9912 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9913 Ops.clear();
9914 Ops.push_back(MergedOps);
9915 }
9916
9917 Ops.push_back(TDescr);
9918 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9919 Ops.push_back(M->getChain());
9920
9921 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9922 MachineMemOperand *MemRef = M->getMemOperand();
9923 DAG.setNodeMemRefs(NewNode, {MemRef});
9924 return SDValue(NewNode, 0);
9925 }
9926 case Intrinsic::amdgcn_global_atomic_fmin_num:
9927 case Intrinsic::amdgcn_global_atomic_fmax_num:
9928 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9929 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9930 MemSDNode *M = cast<MemSDNode>(Op);
9931 SDValue Ops[] = {
9932 M->getOperand(0), // Chain
9933 M->getOperand(2), // Ptr
9934 M->getOperand(3) // Value
9935 };
9936 unsigned Opcode = 0;
9937 switch (IntrID) {
9938 case Intrinsic::amdgcn_global_atomic_fmin_num:
9939 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9940 Opcode = ISD::ATOMIC_LOAD_FMIN;
9941 break;
9942 }
9943 case Intrinsic::amdgcn_global_atomic_fmax_num:
9944 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9945 Opcode = ISD::ATOMIC_LOAD_FMAX;
9946 break;
9947 }
9948 default:
9949 llvm_unreachable("unhandled atomic opcode");
9950 }
9951 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9952 Ops, M->getMemOperand());
9953 }
9954 case Intrinsic::amdgcn_s_get_barrier_state:
9955 case Intrinsic::amdgcn_s_get_named_barrier_state: {
9956 SDValue Chain = Op->getOperand(0);
9957 SmallVector<SDValue, 2> Ops;
9958 unsigned Opc;
9959
9960 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9961 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
9962 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9963 BarID = (BarID >> 4) & 0x3F;
9964 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9965 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9966 Ops.push_back(K);
9967 Ops.push_back(Chain);
9968 } else {
9969 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9970 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9971 SDValue M0Val;
9972 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
9973 DAG.getShiftAmountConstant(4, MVT::i32, DL));
9974 M0Val = SDValue(
9975 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
9976 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
9977 0);
9978 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9979 } else
9980 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
9981 }
9982
9983 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9984 return SDValue(NewMI, 0);
9985 }
9986 default:
9987
9988 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9989 AMDGPU::getImageDimIntrinsicInfo(IntrID))
9990 return lowerImage(Op, ImageDimIntr, DAG, true);
9991
9992 return SDValue();
9993 }
9994 }
9995
9996 // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9997 // dwordx4 if on SI and handle TFE loads.
getMemIntrinsicNode(unsigned Opcode,const SDLoc & DL,SDVTList VTList,ArrayRef<SDValue> Ops,EVT MemVT,MachineMemOperand * MMO,SelectionDAG & DAG) const9998 SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9999 SDVTList VTList,
10000 ArrayRef<SDValue> Ops, EVT MemVT,
10001 MachineMemOperand *MMO,
10002 SelectionDAG &DAG) const {
10003 LLVMContext &C = *DAG.getContext();
10004 MachineFunction &MF = DAG.getMachineFunction();
10005 EVT VT = VTList.VTs[0];
10006
10007 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10008 bool IsTFE = VTList.NumVTs == 3;
10009 if (IsTFE) {
10010 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10011 unsigned NumOpDWords = NumValueDWords + 1;
10012 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10013 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10014 MachineMemOperand *OpDWordsMMO =
10015 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10016 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10017 OpDWordsVT, OpDWordsMMO, DAG);
10018 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10019 DAG.getVectorIdxConstant(NumValueDWords, DL));
10020 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10021 SDValue ValueDWords =
10022 NumValueDWords == 1
10023 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10024 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10025 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10026 ZeroIdx);
10027 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10028 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10029 }
10030
10031 if (!Subtarget->hasDwordx3LoadStores() &&
10032 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10033 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10034 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10035 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10036 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10037 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10038 WidenedMemVT, WidenedMMO);
10039 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10040 DAG.getVectorIdxConstant(0, DL));
10041 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10042 }
10043
10044 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10045 }
10046
handleD16VData(SDValue VData,SelectionDAG & DAG,bool ImageStore) const10047 SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10048 bool ImageStore) const {
10049 EVT StoreVT = VData.getValueType();
10050
10051 // No change for f16 and legal vector D16 types.
10052 if (!StoreVT.isVector())
10053 return VData;
10054
10055 SDLoc DL(VData);
10056 unsigned NumElements = StoreVT.getVectorNumElements();
10057
10058 if (Subtarget->hasUnpackedD16VMem()) {
10059 // We need to unpack the packed data to store.
10060 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10061 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10062
10063 EVT EquivStoreVT =
10064 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
10065 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
10066 return DAG.UnrollVectorOp(ZExt.getNode());
10067 }
10068
10069 // The sq block of gfx8.1 does not estimate register use correctly for d16
10070 // image store instructions. The data operand is computed as if it were not a
10071 // d16 image instruction.
10072 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10073 // Bitcast to i16
10074 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10075 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10076
10077 // Decompose into scalars
10078 SmallVector<SDValue, 4> Elts;
10079 DAG.ExtractVectorElements(IntVData, Elts);
10080
10081 // Group pairs of i16 into v2i16 and bitcast to i32
10082 SmallVector<SDValue, 4> PackedElts;
10083 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10084 SDValue Pair =
10085 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
10086 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10087 PackedElts.push_back(IntPair);
10088 }
10089 if ((NumElements % 2) == 1) {
10090 // Handle v3i16
10091 unsigned I = Elts.size() / 2;
10092 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
10093 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
10094 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10095 PackedElts.push_back(IntPair);
10096 }
10097
10098 // Pad using UNDEF
10099 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
10100
10101 // Build final vector
10102 EVT VecVT =
10103 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
10104 return DAG.getBuildVector(VecVT, DL, PackedElts);
10105 }
10106
10107 if (NumElements == 3) {
10108 EVT IntStoreVT =
10109 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
10110 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10111
10112 EVT WidenedStoreVT = EVT::getVectorVT(
10113 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
10114 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
10115 WidenedStoreVT.getStoreSizeInBits());
10116 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
10117 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
10118 }
10119
10120 assert(isTypeLegal(StoreVT));
10121 return VData;
10122 }
10123
LowerINTRINSIC_VOID(SDValue Op,SelectionDAG & DAG) const10124 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
10125 SelectionDAG &DAG) const {
10126 SDLoc DL(Op);
10127 SDValue Chain = Op.getOperand(0);
10128 unsigned IntrinsicID = Op.getConstantOperandVal(1);
10129 MachineFunction &MF = DAG.getMachineFunction();
10130
10131 switch (IntrinsicID) {
10132 case Intrinsic::amdgcn_exp_compr: {
10133 if (!Subtarget->hasCompressedExport()) {
10134 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10135 DAG.getMachineFunction().getFunction(),
10136 "intrinsic not supported on subtarget", DL.getDebugLoc()));
10137 }
10138 SDValue Src0 = Op.getOperand(4);
10139 SDValue Src1 = Op.getOperand(5);
10140 // Hack around illegal type on SI by directly selecting it.
10141 if (isTypeLegal(Src0.getValueType()))
10142 return SDValue();
10143
10144 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
10145 SDValue Undef = DAG.getPOISON(MVT::f32);
10146 const SDValue Ops[] = {
10147 Op.getOperand(2), // tgt
10148 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
10149 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
10150 Undef, // src2
10151 Undef, // src3
10152 Op.getOperand(7), // vm
10153 DAG.getTargetConstant(1, DL, MVT::i1), // compr
10154 Op.getOperand(3), // en
10155 Op.getOperand(0) // Chain
10156 };
10157
10158 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10159 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
10160 }
10161 case Intrinsic::amdgcn_s_barrier:
10162 case Intrinsic::amdgcn_s_barrier_signal:
10163 case Intrinsic::amdgcn_s_barrier_wait: {
10164 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
10165 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
10166 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
10167 if (WGSize <= ST.getWavefrontSize()) {
10168 // If the workgroup fits in a wave, remove s_barrier_signal and lower
10169 // s_barrier/s_barrier_wait to wave_barrier.
10170 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
10171 return Op.getOperand(0);
10172 else
10173 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
10174 MVT::Other, Op.getOperand(0)),
10175 0);
10176 }
10177 }
10178
10179 if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
10180 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
10181 SDValue K =
10182 DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
10183 SDValue BarSignal =
10184 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
10185 MVT::Other, K, Op.getOperand(0)),
10186 0);
10187 SDValue BarWait =
10188 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
10189 BarSignal.getValue(0)),
10190 0);
10191 return BarWait;
10192 }
10193
10194 return SDValue();
10195 };
10196
10197 case Intrinsic::amdgcn_struct_tbuffer_store:
10198 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10199 SDValue VData = Op.getOperand(2);
10200 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10201 if (IsD16)
10202 VData = handleD16VData(VData, DAG);
10203 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10204 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10205 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10206 SDValue Ops[] = {
10207 Chain,
10208 VData, // vdata
10209 Rsrc, // rsrc
10210 Op.getOperand(4), // vindex
10211 VOffset, // voffset
10212 SOffset, // soffset
10213 Offset, // offset
10214 Op.getOperand(7), // format
10215 Op.getOperand(8), // cachepolicy, swizzled buffer
10216 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10217 };
10218 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10219 : AMDGPUISD::TBUFFER_STORE_FORMAT;
10220 MemSDNode *M = cast<MemSDNode>(Op);
10221 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10222 M->getMemoryVT(), M->getMemOperand());
10223 }
10224
10225 case Intrinsic::amdgcn_raw_tbuffer_store:
10226 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
10227 SDValue VData = Op.getOperand(2);
10228 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10229 if (IsD16)
10230 VData = handleD16VData(VData, DAG);
10231 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10232 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10233 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10234 SDValue Ops[] = {
10235 Chain,
10236 VData, // vdata
10237 Rsrc, // rsrc
10238 DAG.getConstant(0, DL, MVT::i32), // vindex
10239 VOffset, // voffset
10240 SOffset, // soffset
10241 Offset, // offset
10242 Op.getOperand(6), // format
10243 Op.getOperand(7), // cachepolicy, swizzled buffer
10244 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10245 };
10246 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10247 : AMDGPUISD::TBUFFER_STORE_FORMAT;
10248 MemSDNode *M = cast<MemSDNode>(Op);
10249 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10250 M->getMemoryVT(), M->getMemOperand());
10251 }
10252
10253 case Intrinsic::amdgcn_raw_buffer_store:
10254 case Intrinsic::amdgcn_raw_ptr_buffer_store:
10255 case Intrinsic::amdgcn_raw_buffer_store_format:
10256 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
10257 const bool IsFormat =
10258 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
10259 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
10260
10261 SDValue VData = Op.getOperand(2);
10262 EVT VDataVT = VData.getValueType();
10263 EVT EltType = VDataVT.getScalarType();
10264 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
10265 if (IsD16) {
10266 VData = handleD16VData(VData, DAG);
10267 VDataVT = VData.getValueType();
10268 }
10269
10270 if (!isTypeLegal(VDataVT)) {
10271 VData =
10272 DAG.getNode(ISD::BITCAST, DL,
10273 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
10274 }
10275
10276 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10277 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10278 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10279 SDValue Ops[] = {
10280 Chain,
10281 VData,
10282 Rsrc,
10283 DAG.getConstant(0, DL, MVT::i32), // vindex
10284 VOffset, // voffset
10285 SOffset, // soffset
10286 Offset, // offset
10287 Op.getOperand(6), // cachepolicy, swizzled buffer
10288 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10289 };
10290 unsigned Opc =
10291 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
10292 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
10293 MemSDNode *M = cast<MemSDNode>(Op);
10294
10295 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
10296 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
10297 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
10298
10299 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10300 M->getMemoryVT(), M->getMemOperand());
10301 }
10302
10303 case Intrinsic::amdgcn_struct_buffer_store:
10304 case Intrinsic::amdgcn_struct_ptr_buffer_store:
10305 case Intrinsic::amdgcn_struct_buffer_store_format:
10306 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
10307 const bool IsFormat =
10308 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
10309 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
10310
10311 SDValue VData = Op.getOperand(2);
10312 EVT VDataVT = VData.getValueType();
10313 EVT EltType = VDataVT.getScalarType();
10314 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
10315
10316 if (IsD16) {
10317 VData = handleD16VData(VData, DAG);
10318 VDataVT = VData.getValueType();
10319 }
10320
10321 if (!isTypeLegal(VDataVT)) {
10322 VData =
10323 DAG.getNode(ISD::BITCAST, DL,
10324 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
10325 }
10326
10327 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10328 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10329 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10330 SDValue Ops[] = {
10331 Chain,
10332 VData,
10333 Rsrc,
10334 Op.getOperand(4), // vindex
10335 VOffset, // voffset
10336 SOffset, // soffset
10337 Offset, // offset
10338 Op.getOperand(7), // cachepolicy, swizzled buffer
10339 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10340 };
10341 unsigned Opc =
10342 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
10343 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
10344 MemSDNode *M = cast<MemSDNode>(Op);
10345
10346 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
10347 EVT VDataType = VData.getValueType().getScalarType();
10348 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
10349 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
10350
10351 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10352 M->getMemoryVT(), M->getMemOperand());
10353 }
10354 case Intrinsic::amdgcn_raw_buffer_load_lds:
10355 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
10356 case Intrinsic::amdgcn_struct_buffer_load_lds:
10357 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
10358 if (!Subtarget->hasVMemToLDSLoad())
10359 return SDValue();
10360 unsigned Opc;
10361 bool HasVIndex =
10362 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
10363 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
10364 unsigned OpOffset = HasVIndex ? 1 : 0;
10365 SDValue VOffset = Op.getOperand(5 + OpOffset);
10366 bool HasVOffset = !isNullConstant(VOffset);
10367 unsigned Size = Op->getConstantOperandVal(4);
10368
10369 switch (Size) {
10370 default:
10371 return SDValue();
10372 case 1:
10373 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
10374 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
10375 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
10376 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
10377 break;
10378 case 2:
10379 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
10380 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
10381 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
10382 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
10383 break;
10384 case 4:
10385 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
10386 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
10387 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
10388 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
10389 break;
10390 case 12:
10391 if (!Subtarget->hasLDSLoadB96_B128())
10392 return SDValue();
10393 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
10394 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
10395 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
10396 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
10397 break;
10398 case 16:
10399 if (!Subtarget->hasLDSLoadB96_B128())
10400 return SDValue();
10401 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
10402 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
10403 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
10404 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
10405 break;
10406 }
10407
10408 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10409
10410 SmallVector<SDValue, 8> Ops;
10411
10412 if (HasVIndex && HasVOffset)
10413 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
10414 {Op.getOperand(5), // VIndex
10415 VOffset}));
10416 else if (HasVIndex)
10417 Ops.push_back(Op.getOperand(5));
10418 else if (HasVOffset)
10419 Ops.push_back(VOffset);
10420
10421 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10422 Ops.push_back(Rsrc);
10423 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
10424 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
10425 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10426 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
10427 Ops.push_back(DAG.getTargetConstant(
10428 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
10429 DL, MVT::i8)); // cpol
10430 Ops.push_back(DAG.getTargetConstant(
10431 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
10432 ? 1
10433 : 0,
10434 DL, MVT::i8)); // swz
10435 Ops.push_back(M0Val.getValue(0)); // Chain
10436 Ops.push_back(M0Val.getValue(1)); // Glue
10437
10438 auto *M = cast<MemSDNode>(Op);
10439 MachineMemOperand *LoadMMO = M->getMemOperand();
10440 // Don't set the offset value here because the pointer points to the base of
10441 // the buffer.
10442 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10443
10444 MachinePointerInfo StorePtrI = LoadPtrI;
10445 LoadPtrI.V = PoisonValue::get(
10446 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
10447 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10448 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10449
10450 auto F = LoadMMO->getFlags() &
10451 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10452 LoadMMO =
10453 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
10454 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10455
10456 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10457 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
10458 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10459
10460 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
10461 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10462
10463 return SDValue(Load, 0);
10464 }
10465 // Buffers are handled by LowerBufferFatPointers, and we're going to go
10466 // for "trust me" that the remaining cases are global pointers until
10467 // such time as we can put two mem operands on an intrinsic.
10468 case Intrinsic::amdgcn_load_to_lds:
10469 case Intrinsic::amdgcn_global_load_lds: {
10470 if (!Subtarget->hasVMemToLDSLoad())
10471 return SDValue();
10472
10473 unsigned Opc;
10474 unsigned Size = Op->getConstantOperandVal(4);
10475 switch (Size) {
10476 default:
10477 return SDValue();
10478 case 1:
10479 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
10480 break;
10481 case 2:
10482 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
10483 break;
10484 case 4:
10485 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10486 break;
10487 case 12:
10488 if (!Subtarget->hasLDSLoadB96_B128())
10489 return SDValue();
10490 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10491 break;
10492 case 16:
10493 if (!Subtarget->hasLDSLoadB96_B128())
10494 return SDValue();
10495 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10496 break;
10497 }
10498
10499 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10500
10501 SmallVector<SDValue, 6> Ops;
10502
10503 SDValue Addr = Op.getOperand(2); // Global ptr
10504 SDValue VOffset;
10505 // Try to split SAddr and VOffset. Global and LDS pointers share the same
10506 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
10507 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
10508 SDValue LHS = Addr.getOperand(0);
10509 SDValue RHS = Addr.getOperand(1);
10510
10511 if (LHS->isDivergent())
10512 std::swap(LHS, RHS);
10513
10514 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
10515 RHS.getOperand(0).getValueType() == MVT::i32) {
10516 // add (i64 sgpr), (zero_extend (i32 vgpr))
10517 Addr = LHS;
10518 VOffset = RHS.getOperand(0);
10519 }
10520 }
10521
10522 Ops.push_back(Addr);
10523 if (!Addr->isDivergent()) {
10524 Opc = AMDGPU::getGlobalSaddrOp(Opc);
10525 if (!VOffset)
10526 VOffset =
10527 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
10528 DAG.getTargetConstant(0, DL, MVT::i32)),
10529 0);
10530 Ops.push_back(VOffset);
10531 }
10532
10533 Ops.push_back(Op.getOperand(5)); // Offset
10534 Ops.push_back(Op.getOperand(6)); // CPol
10535 Ops.push_back(M0Val.getValue(0)); // Chain
10536 Ops.push_back(M0Val.getValue(1)); // Glue
10537
10538 auto *M = cast<MemSDNode>(Op);
10539 MachineMemOperand *LoadMMO = M->getMemOperand();
10540 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10541 LoadPtrI.Offset = Op->getConstantOperandVal(5);
10542 MachinePointerInfo StorePtrI = LoadPtrI;
10543 LoadPtrI.V = PoisonValue::get(
10544 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
10545 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10546 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10547 auto F = LoadMMO->getFlags() &
10548 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10549 LoadMMO =
10550 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
10551 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10552 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10553 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
10554 LoadMMO->getAAInfo());
10555
10556 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10557 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10558
10559 return SDValue(Load, 0);
10560 }
10561 case Intrinsic::amdgcn_end_cf:
10562 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
10563 Op->getOperand(2), Chain),
10564 0);
10565 case Intrinsic::amdgcn_s_barrier_signal_var: {
10566 // these two intrinsics have two operands: barrier pointer and member count
10567 SDValue Chain = Op->getOperand(0);
10568 SmallVector<SDValue, 2> Ops;
10569 SDValue BarOp = Op->getOperand(2);
10570 SDValue CntOp = Op->getOperand(3);
10571 SDValue M0Val;
10572 // extract the BarrierID from bits 4-9 of BarOp
10573 SDValue BarID;
10574 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10575 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10576 BarID =
10577 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
10578 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10579 0);
10580 // Member count should be put into M0[ShAmt:+6]
10581 // Barrier ID should be put into M0[5:0]
10582 M0Val =
10583 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
10584 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10585 0);
10586 constexpr unsigned ShAmt = 16;
10587 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
10588 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
10589
10590 M0Val = SDValue(
10591 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
10592
10593 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10594
10595 auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_M0, DL,
10596 Op->getVTList(), Ops);
10597 return SDValue(NewMI, 0);
10598 }
10599 case Intrinsic::amdgcn_s_prefetch_data: {
10600 // For non-global address space preserve the chain and remove the call.
10601 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
10602 return Op.getOperand(0);
10603 return Op;
10604 }
10605 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10606 SDValue Ops[] = {
10607 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
10608 Op.getOperand(3), // offset
10609 Op.getOperand(4), // length
10610 };
10611
10612 MemSDNode *M = cast<MemSDNode>(Op);
10613 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
10614 Op->getVTList(), Ops, M->getMemoryVT(),
10615 M->getMemOperand());
10616 }
10617 default: {
10618 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10619 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10620 return lowerImage(Op, ImageDimIntr, DAG, true);
10621
10622 return Op;
10623 }
10624 }
10625 }
10626
shouldPreservePtrArith(const Function & F,EVT PtrVT) const10627 bool SITargetLowering::shouldPreservePtrArith(const Function &F,
10628 EVT PtrVT) const {
10629 return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
10630 }
10631
10632 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
10633 // offset (the offset that is included in bounds checking and swizzling, to be
10634 // split between the instruction's voffset and immoffset fields) and soffset
10635 // (the offset that is excluded from bounds checking and swizzling, to go in
10636 // the instruction's soffset field). This function takes the first kind of
10637 // offset and figures out how to split it between voffset and immoffset.
10638 std::pair<SDValue, SDValue>
splitBufferOffsets(SDValue Offset,SelectionDAG & DAG) const10639 SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
10640 SDLoc DL(Offset);
10641 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
10642 SDValue N0 = Offset;
10643 ConstantSDNode *C1 = nullptr;
10644
10645 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10646 N0 = SDValue();
10647 else if (DAG.isBaseWithConstantOffset(N0)) {
10648 C1 = cast<ConstantSDNode>(N0.getOperand(1));
10649 N0 = N0.getOperand(0);
10650 }
10651
10652 if (C1) {
10653 unsigned ImmOffset = C1->getZExtValue();
10654 // If the immediate value is too big for the immoffset field, put only bits
10655 // that would normally fit in the immoffset field. The remaining value that
10656 // is copied/added for the voffset field is a large power of 2, and it
10657 // stands more chance of being CSEd with the copy/add for another similar
10658 // load/store.
10659 // However, do not do that rounding down if that is a negative
10660 // number, as it appears to be illegal to have a negative offset in the
10661 // vgpr, even if adding the immediate offset makes it positive.
10662 unsigned Overflow = ImmOffset & ~MaxImm;
10663 ImmOffset -= Overflow;
10664 if ((int32_t)Overflow < 0) {
10665 Overflow += ImmOffset;
10666 ImmOffset = 0;
10667 }
10668 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
10669 if (Overflow) {
10670 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10671 if (!N0)
10672 N0 = OverflowVal;
10673 else {
10674 SDValue Ops[] = {N0, OverflowVal};
10675 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
10676 }
10677 }
10678 }
10679 if (!N0)
10680 N0 = DAG.getConstant(0, DL, MVT::i32);
10681 if (!C1)
10682 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10683 return {N0, SDValue(C1, 0)};
10684 }
10685
10686 // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10687 // the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10688 // pointed to by Offsets.
setBufferOffsets(SDValue CombinedOffset,SelectionDAG & DAG,SDValue * Offsets,Align Alignment) const10689 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10690 SelectionDAG &DAG, SDValue *Offsets,
10691 Align Alignment) const {
10692 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10693 SDLoc DL(CombinedOffset);
10694 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10695 uint32_t Imm = C->getZExtValue();
10696 uint32_t SOffset, ImmOffset;
10697 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10698 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10699 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10700 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10701 return;
10702 }
10703 }
10704 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10705 SDValue N0 = CombinedOffset.getOperand(0);
10706 SDValue N1 = CombinedOffset.getOperand(1);
10707 uint32_t SOffset, ImmOffset;
10708 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10709 if (Offset >= 0 &&
10710 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10711 Offsets[0] = N0;
10712 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10713 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10714 return;
10715 }
10716 }
10717
10718 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10719 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10720 : DAG.getConstant(0, DL, MVT::i32);
10721
10722 Offsets[0] = CombinedOffset;
10723 Offsets[1] = SOffsetZero;
10724 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10725 }
10726
bufferRsrcPtrToVector(SDValue MaybePointer,SelectionDAG & DAG) const10727 SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10728 SelectionDAG &DAG) const {
10729 if (!MaybePointer.getValueType().isScalarInteger())
10730 return MaybePointer;
10731
10732 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10733 return Rsrc;
10734 }
10735
10736 // Wrap a global or flat pointer into a buffer intrinsic using the flags
10737 // specified in the intrinsic.
lowerPointerAsRsrcIntrin(SDNode * Op,SelectionDAG & DAG) const10738 SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10739 SelectionDAG &DAG) const {
10740 SDLoc Loc(Op);
10741
10742 SDValue Pointer = Op->getOperand(1);
10743 SDValue Stride = Op->getOperand(2);
10744 SDValue NumRecords = Op->getOperand(3);
10745 SDValue Flags = Op->getOperand(4);
10746
10747 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10748 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10749 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10750 std::optional<uint32_t> ConstStride = std::nullopt;
10751 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10752 ConstStride = ConstNode->getZExtValue();
10753
10754 SDValue NewHighHalf = Masked;
10755 if (!ConstStride || *ConstStride != 0) {
10756 SDValue ShiftedStride;
10757 if (ConstStride) {
10758 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10759 } else {
10760 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10761 ShiftedStride =
10762 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10763 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10764 }
10765 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10766 }
10767
10768 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10769 NewHighHalf, NumRecords, Flags);
10770 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10771 return RsrcPtr;
10772 }
10773
10774 // Handle 8 bit and 16 bit buffer loads
handleByteShortBufferLoads(SelectionDAG & DAG,EVT LoadVT,SDLoc DL,ArrayRef<SDValue> Ops,MachineMemOperand * MMO,bool IsTFE) const10775 SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10776 EVT LoadVT, SDLoc DL,
10777 ArrayRef<SDValue> Ops,
10778 MachineMemOperand *MMO,
10779 bool IsTFE) const {
10780 EVT IntVT = LoadVT.changeTypeToInteger();
10781
10782 if (IsTFE) {
10783 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10784 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
10785 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
10786 MachineFunction &MF = DAG.getMachineFunction();
10787 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
10788 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
10789 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10790 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10791 DAG.getConstant(1, DL, MVT::i32));
10792 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10793 DAG.getConstant(0, DL, MVT::i32));
10794 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
10795 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
10796 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10797 }
10798
10799 unsigned Opc = LoadVT.getScalarType() == MVT::i8
10800 ? AMDGPUISD::BUFFER_LOAD_UBYTE
10801 : AMDGPUISD::BUFFER_LOAD_USHORT;
10802
10803 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10804 SDValue BufferLoad =
10805 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10806 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10807 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10808
10809 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10810 }
10811
10812 // Handle 8 bit and 16 bit buffer stores
handleByteShortBufferStores(SelectionDAG & DAG,EVT VDataType,SDLoc DL,SDValue Ops[],MemSDNode * M) const10813 SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10814 EVT VDataType, SDLoc DL,
10815 SDValue Ops[],
10816 MemSDNode *M) const {
10817 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10818 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10819
10820 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10821 Ops[1] = BufferStoreExt;
10822 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
10823 : AMDGPUISD::BUFFER_STORE_SHORT;
10824 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10825 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10826 M->getMemOperand());
10827 }
10828
getLoadExtOrTrunc(SelectionDAG & DAG,ISD::LoadExtType ExtType,SDValue Op,const SDLoc & SL,EVT VT)10829 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
10830 SDValue Op, const SDLoc &SL, EVT VT) {
10831 if (VT.bitsLT(Op.getValueType()))
10832 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10833
10834 switch (ExtType) {
10835 case ISD::SEXTLOAD:
10836 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10837 case ISD::ZEXTLOAD:
10838 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10839 case ISD::EXTLOAD:
10840 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10841 case ISD::NON_EXTLOAD:
10842 return Op;
10843 }
10844
10845 llvm_unreachable("invalid ext type");
10846 }
10847
10848 // Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10849 // TODO: Skip this on GFX12 which does have scalar sub-dword loads.
widenLoad(LoadSDNode * Ld,DAGCombinerInfo & DCI) const10850 SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
10851 DAGCombinerInfo &DCI) const {
10852 SelectionDAG &DAG = DCI.DAG;
10853 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10854 return SDValue();
10855
10856 // FIXME: Constant loads should all be marked invariant.
10857 unsigned AS = Ld->getAddressSpace();
10858 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10859 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10860 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10861 return SDValue();
10862
10863 // Don't do this early, since it may interfere with adjacent load merging for
10864 // illegal types. We can avoid losing alignment information for exotic types
10865 // pre-legalize.
10866 EVT MemVT = Ld->getMemoryVT();
10867 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10868 MemVT.getSizeInBits() >= 32)
10869 return SDValue();
10870
10871 SDLoc SL(Ld);
10872
10873 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10874 "unexpected vector extload");
10875
10876 // TODO: Drop only high part of range.
10877 SDValue Ptr = Ld->getBasePtr();
10878 SDValue NewLoad = DAG.getLoad(
10879 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10880 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10881 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10882 nullptr); // Drop ranges
10883
10884 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10885 if (MemVT.isFloatingPoint()) {
10886 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10887 "unexpected fp extload");
10888 TruncVT = MemVT.changeTypeToInteger();
10889 }
10890
10891 SDValue Cvt = NewLoad;
10892 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10893 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10894 DAG.getValueType(TruncVT));
10895 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10896 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10897 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10898 } else {
10899 assert(Ld->getExtensionType() == ISD::EXTLOAD);
10900 }
10901
10902 EVT VT = Ld->getValueType(0);
10903 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10904
10905 DCI.AddToWorklist(Cvt.getNode());
10906
10907 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10908 // the appropriate extension from the 32-bit load.
10909 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10910 DCI.AddToWorklist(Cvt.getNode());
10911
10912 // Handle conversion back to floating point if necessary.
10913 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10914
10915 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
10916 }
10917
addressMayBeAccessedAsPrivate(const MachineMemOperand * MMO,const SIMachineFunctionInfo & Info)10918 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10919 const SIMachineFunctionInfo &Info) {
10920 // TODO: Should check if the address can definitely not access stack.
10921 if (Info.isEntryFunction())
10922 return Info.getUserSGPRInfo().hasFlatScratchInit();
10923 return true;
10924 }
10925
/// Custom lowering for a load node.
///
/// Two groups of loads are handled:
///  - Non-extending loads whose memory type is narrower than 32 bits are
///    re-emitted as an any-extending i32 load and then truncated (or, for
///    vector memory types, unpacked bit by bit into i1 elements).
///  - Vector loads are split, widened, scalarized or expanded depending on
///    the address space, alignment, element count and subtarget features.
///
/// An empty SDValue is returned on every path where none of these expansions
/// is applied.
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();
  MachineMemOperand *MMO = Load->getMemOperand();

  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    // A legal i16 load needs no custom handling.
    if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
      return SDValue();

    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate to 1 bit.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();

    // An i1 memory type is loaded as i8; every other type reaching this point
    // is loaded as i16.
    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;

    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
                                   RealMemVT, MMO);

    if (!MemVT.isVector()) {
      // Scalar case: truncate the i32 result and forward the new chain.
      SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
                       NewLD.getValue(1)};

      return DAG.getMergeValues(Ops, DL);
    }

    // Vector case: element I is the loaded word shifted right by I bits and
    // truncated to i1.
    SmallVector<SDValue, 3> Elts;
    for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
      SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
                                DAG.getConstant(I, DL, MVT::i32));

      Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
    }

    SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};

    return DAG.getMergeValues(Ops, DL);
  }

  // Everything below only deals with vector loads.
  if (!MemVT.isVector())
    return SDValue();

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");

  Align Alignment = Load->getAlign();
  unsigned AS = Load->getAddressSpace();
  // On subtargets with the LDS misaligned bug, split flat loads wider than 32
  // bits whose alignment is below their store size.
  if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
      Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
    return SplitVectorLoad(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  // From here on AS is the address space whose legalization rules are applied,
  // which may differ from the load's own address space.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
             ? AMDGPUAS::PRIVATE_ADDRESS
             : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = MemVT.getVectorNumElements();

  // Constant-address loads, and simple global loads with no clobbered memory
  // operand when global scalarization is enabled: if the load is uniform,
  // at least 4-byte aligned and has fewer than 32 elements, power-of-2 vectors
  // (and 3-element vectors where scalar dwordx3 loads exist) are left alone;
  // other element counts are widened or split.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      (AS == AMDGPUAS::GLOBAL_ADDRESS &&
       Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
       isMemOpHasNoClobberedMemOperand(Load))) {
    if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
        Alignment >= Align(4) && NumElements < 32) {
      if (MemVT.isPow2VectorType() ||
          (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
        return SDValue();
      return WidenOrSplitVectorLoad(Op, DAG);
    }
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
    //
  }
  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
    // More than four elements never fit a single access here.
    if (NumElements > 4)
      return SplitVectorLoad(Op, DAG);
    // v3 loads not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return WidenOrSplitVectorLoad(Op, DAG);

    // v3 and v4 loads are supported for private and global memory.
    return SDValue();
  }
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Depending on the setting of the private_element_size field in the
    // resource descriptor, we can only make private accesses up to a certain
    // size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4: {
      // Max element size 4: the vector load is scalarized.
      auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
      return DAG.getMergeValues({Op0, Op1}, DL);
    }
    case 8:
      // Max element size 8: at most two elements per access.
      if (NumElements > 2)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    case 16:
      // Same as global/flat
      if (NumElements > 4)
        return SplitVectorLoad(Op, DAG);
      // v3 loads not supported on SI.
      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
        return WidenOrSplitVectorLoad(Op, DAG);

      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // Keep the load whole only if the access is allowed at this alignment and
    // the reported Fast value is greater than 1; otherwise split it.
    unsigned Fast = 0;
    auto Flags = Load->getMemOperand()->getFlags();
    if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
                                           Load->getAlign(), Flags, &Fast) &&
        Fast > 1)
      return SDValue();

    if (MemVT.isVector())
      return SplitVectorLoad(Op, DAG);
  }

  // Remaining loads: expand them if the access is not allowed at the load's
  // alignment.
  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                      MemVT, *Load->getMemOperand())) {
    auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
    return DAG.getMergeValues({Op0, Op1}, DL);
  }

  return SDValue();
}
11067
LowerSELECT(SDValue Op,SelectionDAG & DAG) const11068 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
11069 EVT VT = Op.getValueType();
11070 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
11071 VT.getSizeInBits() == 512)
11072 return splitTernaryVectorOp(Op, DAG);
11073
11074 assert(VT.getSizeInBits() == 64);
11075
11076 SDLoc DL(Op);
11077 SDValue Cond = Op.getOperand(0);
11078
11079 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
11080 SDValue One = DAG.getConstant(1, DL, MVT::i32);
11081
11082 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11083 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
11084
11085 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
11086 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
11087
11088 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
11089
11090 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
11091 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
11092
11093 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
11094
11095 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
11096 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11097 }
11098
11099 // Catch division cases where we can use shortcuts with rcp and rsq
11100 // instructions.
lowerFastUnsafeFDIV(SDValue Op,SelectionDAG & DAG) const11101 SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
11102 SelectionDAG &DAG) const {
11103 SDLoc SL(Op);
11104 SDValue LHS = Op.getOperand(0);
11105 SDValue RHS = Op.getOperand(1);
11106 EVT VT = Op.getValueType();
11107 const SDNodeFlags Flags = Op->getFlags();
11108
11109 bool AllowInaccurateRcp =
11110 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
11111
11112 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
11113 // Without !fpmath accuracy information, we can't do more because we don't
11114 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
11115 // f16 is always accurate enough
11116 if (!AllowInaccurateRcp && VT != MVT::f16)
11117 return SDValue();
11118
11119 if (CLHS->isExactlyValue(1.0)) {
11120 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
11121 // the CI documentation has a worst case error of 1 ulp.
11122 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
11123 // use it as long as we aren't trying to use denormals.
11124 //
11125 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
11126
11127 // 1.0 / sqrt(x) -> rsq(x)
11128
11129 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
11130 // error seems really high at 2^29 ULP.
11131 // 1.0 / x -> rcp(x)
11132 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11133 }
11134
11135 // Same as for 1.0, but expand the sign out of the constant.
11136 if (CLHS->isExactlyValue(-1.0)) {
11137 // -1.0 / x -> rcp (fneg x)
11138 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
11139 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
11140 }
11141 }
11142
11143 // For f16 require afn or arcp.
11144 // For f32 require afn.
11145 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
11146 return SDValue();
11147
11148 // Turn into multiply by the reciprocal.
11149 // x / y -> x * (1.0 / y)
11150 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11151 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
11152 }
11153
lowerFastUnsafeFDIV64(SDValue Op,SelectionDAG & DAG) const11154 SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
11155 SelectionDAG &DAG) const {
11156 SDLoc SL(Op);
11157 SDValue X = Op.getOperand(0);
11158 SDValue Y = Op.getOperand(1);
11159 EVT VT = Op.getValueType();
11160 const SDNodeFlags Flags = Op->getFlags();
11161
11162 bool AllowInaccurateDiv =
11163 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
11164 if (!AllowInaccurateDiv)
11165 return SDValue();
11166
11167 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
11168 SDValue One = DAG.getConstantFP(1.0, SL, VT);
11169
11170 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
11171 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
11172
11173 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
11174 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
11175 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
11176 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
11177 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
11178 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
11179 }
11180
getFPBinOp(SelectionDAG & DAG,unsigned Opcode,const SDLoc & SL,EVT VT,SDValue A,SDValue B,SDValue GlueChain,SDNodeFlags Flags)11181 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
11182 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
11183 SDNodeFlags Flags) {
11184 if (GlueChain->getNumValues() <= 1) {
11185 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
11186 }
11187
11188 assert(GlueChain->getNumValues() == 3);
11189
11190 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
11191 switch (Opcode) {
11192 default:
11193 llvm_unreachable("no chain equivalent for opcode");
11194 case ISD::FMUL:
11195 Opcode = AMDGPUISD::FMUL_W_CHAIN;
11196 break;
11197 }
11198
11199 return DAG.getNode(Opcode, SL, VTList,
11200 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
11201 Flags);
11202 }
11203
getFPTernOp(SelectionDAG & DAG,unsigned Opcode,const SDLoc & SL,EVT VT,SDValue A,SDValue B,SDValue C,SDValue GlueChain,SDNodeFlags Flags)11204 static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
11205 EVT VT, SDValue A, SDValue B, SDValue C,
11206 SDValue GlueChain, SDNodeFlags Flags) {
11207 if (GlueChain->getNumValues() <= 1) {
11208 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
11209 }
11210
11211 assert(GlueChain->getNumValues() == 3);
11212
11213 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
11214 switch (Opcode) {
11215 default:
11216 llvm_unreachable("no chain equivalent for opcode");
11217 case ISD::FMA:
11218 Opcode = AMDGPUISD::FMA_W_CHAIN;
11219 break;
11220 }
11221
11222 return DAG.getNode(Opcode, SL, VTList,
11223 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
11224 Flags);
11225 }
11226
LowerFDIV16(SDValue Op,SelectionDAG & DAG) const11227 SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
11228 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11229 return FastLowered;
11230
11231 SDLoc SL(Op);
11232 SDValue LHS = Op.getOperand(0);
11233 SDValue RHS = Op.getOperand(1);
11234
11235 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
11236 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
11237 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
11238 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
11239 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
11240 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
11241 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
11242 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
11243 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
11244 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
11245 // q16.u = opx(V_CVT_F16_F32, q32.u);
11246 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
11247
11248 // We will use ISD::FMA on targets that don't support ISD::FMAD.
11249 unsigned FMADOpCode =
11250 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
11251
11252 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
11253 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
11254 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
11255 SDValue Rcp =
11256 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
11257 SDValue Quot =
11258 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
11259 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11260 Op->getFlags());
11261 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
11262 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11263 Op->getFlags());
11264 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
11265 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
11266 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
11267 DAG.getConstant(0xff800000, SL, MVT::i32));
11268 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
11269 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
11270 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
11271 DAG.getTargetConstant(0, SL, MVT::i32));
11272 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
11273 Op->getFlags());
11274 }
11275
11276 // Faster 2.5 ULP division that does not support denormals.
lowerFDIV_FAST(SDValue Op,SelectionDAG & DAG) const11277 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
11278 SDNodeFlags Flags = Op->getFlags();
11279 SDLoc SL(Op);
11280 SDValue LHS = Op.getOperand(1);
11281 SDValue RHS = Op.getOperand(2);
11282
11283 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
11284
11285 const APFloat K0Val(0x1p+96f);
11286 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
11287
11288 const APFloat K1Val(0x1p-32f);
11289 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
11290
11291 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
11292
11293 EVT SetCCVT =
11294 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
11295
11296 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
11297
11298 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
11299
11300 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
11301
11302 // rcp does not support denormals.
11303 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
11304
11305 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
11306
11307 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
11308 }
11309
11310 // Returns immediate value for setting the F32 denorm mode when using the
11311 // S_DENORM_MODE instruction.
getSPDenormModeValue(uint32_t SPDenormMode,SelectionDAG & DAG,const SIMachineFunctionInfo * Info,const GCNSubtarget * ST)11312 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
11313 const SIMachineFunctionInfo *Info,
11314 const GCNSubtarget *ST) {
11315 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
11316 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
11317 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
11318 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
11319 }
11320
LowerFDIV32(SDValue Op,SelectionDAG & DAG) const11321 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
11322 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11323 return FastLowered;
11324
11325 // The selection matcher assumes anything with a chain selecting to a
11326 // mayRaiseFPException machine instruction. Since we're introducing a chain
11327 // here, we need to explicitly report nofpexcept for the regular fdiv
11328 // lowering.
11329 SDNodeFlags Flags = Op->getFlags();
11330 Flags.setNoFPExcept(true);
11331
11332 SDLoc SL(Op);
11333 SDValue LHS = Op.getOperand(0);
11334 SDValue RHS = Op.getOperand(1);
11335
11336 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
11337
11338 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
11339
11340 SDValue DenominatorScaled =
11341 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
11342 SDValue NumeratorScaled =
11343 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
11344
11345 // Denominator is scaled to not be denormal, so using rcp is ok.
11346 SDValue ApproxRcp =
11347 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
11348 SDValue NegDivScale0 =
11349 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
11350
11351 using namespace AMDGPU::Hwreg;
11352 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
11353 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
11354
11355 const MachineFunction &MF = DAG.getMachineFunction();
11356 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
11357 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
11358
11359 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
11360 const bool HasDynamicDenormals =
11361 (DenormMode.Input == DenormalMode::Dynamic) ||
11362 (DenormMode.Output == DenormalMode::Dynamic);
11363
11364 SDValue SavedDenormMode;
11365
11366 if (!PreservesDenormals) {
11367 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
11368 // lowering. The chain dependence is insufficient, and we need glue. We do
11369 // not need the glue variants in a strictfp function.
11370
11371 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
11372
11373 SDValue Glue = DAG.getEntryNode();
11374 if (HasDynamicDenormals) {
11375 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
11376 DAG.getVTList(MVT::i32, MVT::Glue),
11377 {BitField, Glue});
11378 SavedDenormMode = SDValue(GetReg, 0);
11379
11380 Glue = DAG.getMergeValues(
11381 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
11382 }
11383
11384 SDNode *EnableDenorm;
11385 if (Subtarget->hasDenormModeInst()) {
11386 const SDValue EnableDenormValue =
11387 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
11388
11389 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
11390 EnableDenormValue)
11391 .getNode();
11392 } else {
11393 const SDValue EnableDenormValue =
11394 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
11395 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
11396 {EnableDenormValue, BitField, Glue});
11397 }
11398
11399 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
11400 SDValue(EnableDenorm, 1)};
11401
11402 NegDivScale0 = DAG.getMergeValues(Ops, SL);
11403 }
11404
11405 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
11406 ApproxRcp, One, NegDivScale0, Flags);
11407
11408 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
11409 ApproxRcp, Fma0, Flags);
11410
11411 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
11412 Fma1, Flags);
11413
11414 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
11415 NumeratorScaled, Mul, Flags);
11416
11417 SDValue Fma3 =
11418 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
11419
11420 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
11421 NumeratorScaled, Fma3, Flags);
11422
11423 if (!PreservesDenormals) {
11424 SDNode *DisableDenorm;
11425 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
11426 const SDValue DisableDenormValue = getSPDenormModeValue(
11427 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
11428
11429 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
11430 DisableDenorm =
11431 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
11432 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
11433 .getNode();
11434 } else {
11435 assert(HasDynamicDenormals == (bool)SavedDenormMode);
11436 const SDValue DisableDenormValue =
11437 HasDynamicDenormals
11438 ? SavedDenormMode
11439 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
11440
11441 DisableDenorm = DAG.getMachineNode(
11442 AMDGPU::S_SETREG_B32, SL, MVT::Other,
11443 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
11444 }
11445
11446 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
11447 SDValue(DisableDenorm, 0), DAG.getRoot());
11448 DAG.setRoot(OutputChain);
11449 }
11450
11451 SDValue Scale = NumeratorScaled.getValue(1);
11452 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
11453 {Fma4, Fma1, Fma3, Scale}, Flags);
11454
11455 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
11456 }
11457
LowerFDIV64(SDValue Op,SelectionDAG & DAG) const11458 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
11459 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
11460 return FastLowered;
11461
11462 SDLoc SL(Op);
11463 SDValue X = Op.getOperand(0);
11464 SDValue Y = Op.getOperand(1);
11465
11466 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
11467
11468 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
11469
11470 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
11471
11472 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
11473
11474 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
11475
11476 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
11477
11478 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
11479
11480 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
11481
11482 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
11483
11484 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
11485 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
11486
11487 SDValue Fma4 =
11488 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
11489
11490 SDValue Scale;
11491
11492 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
11493 // Workaround a hardware bug on SI where the condition output from div_scale
11494 // is not usable.
11495
11496 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
11497
11498 // Figure out if the scale to use for div_fmas.
11499 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
11500 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
11501 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
11502 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
11503
11504 SDValue NumHi =
11505 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
11506 SDValue DenHi =
11507 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
11508
11509 SDValue Scale0Hi =
11510 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
11511 SDValue Scale1Hi =
11512 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
11513
11514 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
11515 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
11516 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
11517 } else {
11518 Scale = DivScale1.getValue(1);
11519 }
11520
11521 SDValue Fmas =
11522 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
11523
11524 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
11525 }
11526
LowerFDIV(SDValue Op,SelectionDAG & DAG) const11527 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
11528 EVT VT = Op.getValueType();
11529
11530 if (VT == MVT::f32)
11531 return LowerFDIV32(Op, DAG);
11532
11533 if (VT == MVT::f64)
11534 return LowerFDIV64(Op, DAG);
11535
11536 if (VT == MVT::f16)
11537 return LowerFDIV16(Op, DAG);
11538
11539 llvm_unreachable("Unexpected type for fdiv");
11540 }
11541
LowerFFREXP(SDValue Op,SelectionDAG & DAG) const11542 SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
11543 SDLoc dl(Op);
11544 SDValue Val = Op.getOperand(0);
11545 EVT VT = Val.getValueType();
11546 EVT ResultExpVT = Op->getValueType(1);
11547 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11548
11549 SDValue Mant = DAG.getNode(
11550 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11551 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
11552
11553 SDValue Exp = DAG.getNode(
11554 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
11555 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
11556
11557 if (Subtarget->hasFractBug()) {
11558 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
11559 SDValue Inf =
11560 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
11561
11562 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
11563 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
11564 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
11565 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
11566 }
11567
11568 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
11569 return DAG.getMergeValues({Mant, CastExp}, dl);
11570 }
11571
LowerSTORE(SDValue Op,SelectionDAG & DAG) const11572 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
11573 SDLoc DL(Op);
11574 StoreSDNode *Store = cast<StoreSDNode>(Op);
11575 EVT VT = Store->getMemoryVT();
11576
11577 if (VT == MVT::i1) {
11578 return DAG.getTruncStore(
11579 Store->getChain(), DL,
11580 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
11581 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
11582 }
11583
11584 assert(VT.isVector() &&
11585 Store->getValue().getValueType().getScalarType() == MVT::i32);
11586
11587 unsigned AS = Store->getAddressSpace();
11588 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11589 Store->getAlign().value() < VT.getStoreSize() &&
11590 VT.getSizeInBits() > 32) {
11591 return SplitVectorStore(Op, DAG);
11592 }
11593
11594 MachineFunction &MF = DAG.getMachineFunction();
11595 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11596 // If there is a possibility that flat instruction access scratch memory
11597 // then we need to use the same legalization rules we use for private.
11598 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11599 !Subtarget->hasMultiDwordFlatScratchAddressing())
11600 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
11601 ? AMDGPUAS::PRIVATE_ADDRESS
11602 : AMDGPUAS::GLOBAL_ADDRESS;
11603
11604 unsigned NumElements = VT.getVectorNumElements();
11605 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11606 if (NumElements > 4)
11607 return SplitVectorStore(Op, DAG);
11608 // v3 stores not supported on SI.
11609 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11610 return SplitVectorStore(Op, DAG);
11611
11612 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11613 VT, *Store->getMemOperand()))
11614 return expandUnalignedStore(Store, DAG);
11615
11616 return SDValue();
11617 }
11618 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11619 switch (Subtarget->getMaxPrivateElementSize()) {
11620 case 4:
11621 return scalarizeVectorStore(Store, DAG);
11622 case 8:
11623 if (NumElements > 2)
11624 return SplitVectorStore(Op, DAG);
11625 return SDValue();
11626 case 16:
11627 if (NumElements > 4 ||
11628 (NumElements == 3 && !Subtarget->enableFlatScratch()))
11629 return SplitVectorStore(Op, DAG);
11630 return SDValue();
11631 default:
11632 llvm_unreachable("unsupported private_element_size");
11633 }
11634 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11635 unsigned Fast = 0;
11636 auto Flags = Store->getMemOperand()->getFlags();
11637 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
11638 Store->getAlign(), Flags, &Fast) &&
11639 Fast > 1)
11640 return SDValue();
11641
11642 if (VT.isVector())
11643 return SplitVectorStore(Op, DAG);
11644
11645 return expandUnalignedStore(Store, DAG);
11646 }
11647
11648 // Probably an invalid store. If so we'll end up emitting a selection error.
11649 return SDValue();
11650 }
11651
11652 // Avoid the full correct expansion for f32 sqrt when promoting from f16.
lowerFSQRTF16(SDValue Op,SelectionDAG & DAG) const11653 SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
11654 SDLoc SL(Op);
11655 assert(!Subtarget->has16BitInsts());
11656 SDNodeFlags Flags = Op->getFlags();
11657 SDValue Ext =
11658 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
11659
11660 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
11661 SDValue Sqrt =
11662 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
11663
11664 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
11665 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
11666 }
11667
lowerFSQRTF32(SDValue Op,SelectionDAG & DAG) const11668 SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
11669 SDLoc DL(Op);
11670 SDNodeFlags Flags = Op->getFlags();
11671 MVT VT = Op.getValueType().getSimpleVT();
11672 const SDValue X = Op.getOperand(0);
11673
11674 if (allowApproxFunc(DAG, Flags)) {
11675 // Instruction is 1ulp but ignores denormals.
11676 return DAG.getNode(
11677 ISD::INTRINSIC_WO_CHAIN, DL, VT,
11678 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
11679 }
11680
11681 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
11682 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
11683
11684 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
11685
11686 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
11687
11688 SDValue SqrtX =
11689 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11690
11691 SDValue SqrtS;
11692 if (needsDenormHandlingF32(DAG, X, Flags)) {
11693 SDValue SqrtID =
11694 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11695 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11696
11697 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11698 SDValue SqrtSNextDownInt =
11699 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11700 DAG.getAllOnesConstant(DL, MVT::i32));
11701 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11702
11703 SDValue NegSqrtSNextDown =
11704 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11705
11706 SDValue SqrtVP =
11707 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11708
11709 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11710 DAG.getConstant(1, DL, MVT::i32));
11711 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11712
11713 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
11714 SDValue SqrtVS =
11715 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11716
11717 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11718 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11719
11720 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11721 Flags);
11722
11723 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11724 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11725 Flags);
11726 } else {
11727 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11728
11729 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11730
11731 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11732 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11733 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11734
11735 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11736 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11737 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11738
11739 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11740 SDValue SqrtD =
11741 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11742 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11743 }
11744
11745 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11746
11747 SDValue ScaledDown =
11748 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11749
11750 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11751 SDValue IsZeroOrInf =
11752 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11753 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11754
11755 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11756 }
11757
lowerFSQRTF64(SDValue Op,SelectionDAG & DAG) const11758 SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11759 // For double type, the SQRT and RSQ instructions don't have required
11760 // precision, we apply Goldschmidt's algorithm to improve the result:
11761 //
11762 // y0 = rsq(x)
11763 // g0 = x * y0
11764 // h0 = 0.5 * y0
11765 //
11766 // r0 = 0.5 - h0 * g0
11767 // g1 = g0 * r0 + g0
11768 // h1 = h0 * r0 + h0
11769 //
11770 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11771 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11772 // h2 = h1 * r1 + h1
11773 //
11774 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11775 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11776 //
11777 // sqrt(x) = g3
11778
11779 SDNodeFlags Flags = Op->getFlags();
11780
11781 SDLoc DL(Op);
11782
11783 SDValue X = Op.getOperand(0);
11784 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11785
11786 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11787
11788 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11789
11790 // Scale up input if it is too small.
11791 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11792 SDValue ScaleUp =
11793 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11794 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11795
11796 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11797
11798 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11799
11800 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11801 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11802
11803 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11804 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11805
11806 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11807
11808 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11809
11810 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11811 SDValue SqrtD0 =
11812 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11813
11814 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11815
11816 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11817 SDValue SqrtD1 =
11818 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11819
11820 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11821
11822 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
11823 SDValue ScaleDown =
11824 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11825 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11826
11827 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11828 // with finite only or nsz because rsq(+/-0) = +/-inf
11829
11830 // TODO: Check for DAZ and expand to subnormals
11831 SDValue IsZeroOrInf =
11832 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11833 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11834
11835 // If x is +INF, +0, or -0, use its original value
11836 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11837 Flags);
11838 }
11839
LowerTrig(SDValue Op,SelectionDAG & DAG) const11840 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11841 SDLoc DL(Op);
11842 EVT VT = Op.getValueType();
11843 SDValue Arg = Op.getOperand(0);
11844 SDValue TrigVal;
11845
11846 // Propagate fast-math flags so that the multiply we introduce can be folded
11847 // if Arg is already the result of a multiply by constant.
11848 auto Flags = Op->getFlags();
11849
11850 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11851
11852 if (Subtarget->hasTrigReducedRange()) {
11853 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11854 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11855 } else {
11856 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11857 }
11858
11859 switch (Op.getOpcode()) {
11860 case ISD::FCOS:
11861 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11862 case ISD::FSIN:
11863 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11864 default:
11865 llvm_unreachable("Wrong trig opcode");
11866 }
11867 }
11868
LowerATOMIC_CMP_SWAP(SDValue Op,SelectionDAG & DAG) const11869 SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
11870 SelectionDAG &DAG) const {
11871 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11872 assert(AtomicNode->isCompareAndSwap());
11873 unsigned AS = AtomicNode->getAddressSpace();
11874
11875 // No custom lowering required for local address space
11876 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11877 return Op;
11878
11879 // Non-local address space requires custom lowering for atomic compare
11880 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11881 SDLoc DL(Op);
11882 SDValue ChainIn = Op.getOperand(0);
11883 SDValue Addr = Op.getOperand(1);
11884 SDValue Old = Op.getOperand(2);
11885 SDValue New = Op.getOperand(3);
11886 EVT VT = Op.getValueType();
11887 MVT SimpleVT = VT.getSimpleVT();
11888 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11889
11890 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11891 SDValue Ops[] = {ChainIn, Addr, NewOld};
11892
11893 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
11894 Op->getVTList(), Ops, VT,
11895 AtomicNode->getMemOperand());
11896 }
11897
11898 //===----------------------------------------------------------------------===//
11899 // Custom DAG optimizations
11900 //===----------------------------------------------------------------------===//
11901
11902 SDValue
performUCharToFloatCombine(SDNode * N,DAGCombinerInfo & DCI) const11903 SITargetLowering::performUCharToFloatCombine(SDNode *N,
11904 DAGCombinerInfo &DCI) const {
11905 EVT VT = N->getValueType(0);
11906 EVT ScalarVT = VT.getScalarType();
11907 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11908 return SDValue();
11909
11910 SelectionDAG &DAG = DCI.DAG;
11911 SDLoc DL(N);
11912
11913 SDValue Src = N->getOperand(0);
11914 EVT SrcVT = Src.getValueType();
11915
11916 // TODO: We could try to match extracting the higher bytes, which would be
11917 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11918 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11919 // about in practice.
11920 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11921 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11922 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11923 DCI.AddToWorklist(Cvt.getNode());
11924
11925 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11926 if (ScalarVT != MVT::f32) {
11927 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11928 DAG.getTargetConstant(0, DL, MVT::i32));
11929 }
11930 return Cvt;
11931 }
11932 }
11933
11934 return SDValue();
11935 }
11936
performFCopySignCombine(SDNode * N,DAGCombinerInfo & DCI) const11937 SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11938 DAGCombinerInfo &DCI) const {
11939 SDValue MagnitudeOp = N->getOperand(0);
11940 SDValue SignOp = N->getOperand(1);
11941
11942 // The generic combine for fcopysign + fp cast is too conservative with
11943 // vectors, and also gets confused by the splitting we will perform here, so
11944 // peek through FP casts.
11945 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
11946 SignOp.getOpcode() == ISD::FP_ROUND)
11947 SignOp = SignOp.getOperand(0);
11948
11949 SelectionDAG &DAG = DCI.DAG;
11950 SDLoc DL(N);
11951 EVT SignVT = SignOp.getValueType();
11952
11953 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11954 // lower half with a copy.
11955 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11956 EVT MagVT = MagnitudeOp.getValueType();
11957
11958 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
11959
11960 if (MagVT.getScalarType() == MVT::f64) {
11961 EVT F32VT = MagVT.isVector()
11962 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
11963 : MVT::v2f32;
11964
11965 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
11966
11967 SmallVector<SDValue, 8> NewElts;
11968 for (unsigned I = 0; I != NumElts; ++I) {
11969 SDValue MagLo =
11970 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11971 DAG.getConstant(2 * I, DL, MVT::i32));
11972 SDValue MagHi =
11973 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11974 DAG.getConstant(2 * I + 1, DL, MVT::i32));
11975
11976 SDValue SignOpElt =
11977 MagVT.isVector()
11978 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
11979 SignOp, DAG.getConstant(I, DL, MVT::i32))
11980 : SignOp;
11981
11982 SDValue HiOp =
11983 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
11984
11985 SDValue Vector =
11986 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11987
11988 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11989 NewElts.push_back(NewElt);
11990 }
11991
11992 if (NewElts.size() == 1)
11993 return NewElts[0];
11994
11995 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
11996 }
11997
11998 if (SignVT.getScalarType() != MVT::f64)
11999 return SDValue();
12000
12001 // Reduce width of sign operand, we only need the highest bit.
12002 //
12003 // fcopysign f64:x, f64:y ->
12004 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
12005 // TODO: In some cases it might make sense to go all the way to f16.
12006
12007 EVT F32VT = MagVT.isVector()
12008 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12009 : MVT::v2f32;
12010
12011 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
12012
12013 SmallVector<SDValue, 8> F32Signs;
12014 for (unsigned I = 0; I != NumElts; ++I) {
12015 // Take sign from odd elements of cast vector
12016 SDValue SignAsF32 =
12017 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
12018 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12019 F32Signs.push_back(SignAsF32);
12020 }
12021
12022 SDValue NewSign =
12023 NumElts == 1
12024 ? F32Signs.back()
12025 : DAG.getNode(ISD::BUILD_VECTOR, DL,
12026 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
12027 F32Signs);
12028
12029 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
12030 NewSign);
12031 }
12032
12033 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
12034 // (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
12035 // bits
12036
12037 // This is a variant of
12038 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
12039 //
12040 // The normal DAG combiner will do this, but only if the add has one use since
12041 // that would increase the number of instructions.
12042 //
12043 // This prevents us from seeing a constant offset that can be folded into a
12044 // memory instruction's addressing mode. If we know the resulting add offset of
12045 // a pointer can be folded into an addressing offset, we can replace the pointer
12046 // operand with the add of new constant offset. This eliminates one of the uses,
12047 // and may allow the remaining use to also be simplified.
12048 //
performSHLPtrCombine(SDNode * N,unsigned AddrSpace,EVT MemVT,DAGCombinerInfo & DCI) const12049 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12050 EVT MemVT,
12051 DAGCombinerInfo &DCI) const {
12052 SDValue N0 = N->getOperand(0);
12053 SDValue N1 = N->getOperand(1);
12054
12055 // We only do this to handle cases where it's profitable when there are
12056 // multiple uses of the add, so defer to the standard combine.
12057 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
12058 N0->hasOneUse())
12059 return SDValue();
12060
12061 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
12062 if (!CN1)
12063 return SDValue();
12064
12065 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
12066 if (!CAdd)
12067 return SDValue();
12068
12069 SelectionDAG &DAG = DCI.DAG;
12070
12071 if (N0->getOpcode() == ISD::OR &&
12072 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
12073 return SDValue();
12074
12075 // If the resulting offset is too large, we can't fold it into the
12076 // addressing mode offset.
12077 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
12078 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
12079
12080 AddrMode AM;
12081 AM.HasBaseReg = true;
12082 AM.BaseOffs = Offset.getSExtValue();
12083 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
12084 return SDValue();
12085
12086 SDLoc SL(N);
12087 EVT VT = N->getValueType(0);
12088
12089 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
12090 SDValue COffset = DAG.getConstant(Offset, SL, VT);
12091
12092 SDNodeFlags Flags;
12093 Flags.setNoUnsignedWrap(
12094 N->getFlags().hasNoUnsignedWrap() &&
12095 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
12096
12097 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
12098 }
12099
12100 /// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
12101 /// by the chain and intrinsic ID. Theoretically we would also need to check the
12102 /// specific intrinsic, but they all place the pointer operand first.
getBasePtrIndex(const MemSDNode * N)12103 static unsigned getBasePtrIndex(const MemSDNode *N) {
12104 switch (N->getOpcode()) {
12105 case ISD::STORE:
12106 case ISD::INTRINSIC_W_CHAIN:
12107 case ISD::INTRINSIC_VOID:
12108 return 2;
12109 default:
12110 return 1;
12111 }
12112 }
12113
performMemSDNodeCombine(MemSDNode * N,DAGCombinerInfo & DCI) const12114 SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
12115 DAGCombinerInfo &DCI) const {
12116 SelectionDAG &DAG = DCI.DAG;
12117
12118 unsigned PtrIdx = getBasePtrIndex(N);
12119 SDValue Ptr = N->getOperand(PtrIdx);
12120
12121 // TODO: We could also do this for multiplies.
12122 if (Ptr.getOpcode() == ISD::SHL) {
12123 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
12124 N->getMemoryVT(), DCI);
12125 if (NewPtr) {
12126 SmallVector<SDValue, 8> NewOps(N->ops());
12127
12128 NewOps[PtrIdx] = NewPtr;
12129 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
12130 }
12131 }
12132
12133 return SDValue();
12134 }
12135
bitOpWithConstantIsReducible(unsigned Opc,uint32_t Val)12136 static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
12137 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
12138 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
12139 (Opc == ISD::XOR && Val == 0);
12140 }
12141
12142 // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
12143 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
12144 // integer combine opportunities since most 64-bit operations are decomposed
12145 // this way. TODO: We won't want this for SALU especially if it is an inline
12146 // immediate.
splitBinaryBitConstantOp(DAGCombinerInfo & DCI,const SDLoc & SL,unsigned Opc,SDValue LHS,const ConstantSDNode * CRHS) const12147 SDValue SITargetLowering::splitBinaryBitConstantOp(
12148 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
12149 const ConstantSDNode *CRHS) const {
12150 uint64_t Val = CRHS->getZExtValue();
12151 uint32_t ValLo = Lo_32(Val);
12152 uint32_t ValHi = Hi_32(Val);
12153 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12154
12155 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
12156 bitOpWithConstantIsReducible(Opc, ValHi)) ||
12157 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
12158 // We have 64-bit scalar and/or/xor, but do not have vector forms.
12159 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
12160 !CRHS->user_begin()->isDivergent())
12161 return SDValue();
12162
12163 // If we need to materialize a 64-bit immediate, it will be split up later
12164 // anyway. Avoid creating the harder to understand 64-bit immediate
12165 // materialization.
12166 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
12167 }
12168
12169 return SDValue();
12170 }
12171
isBoolSGPR(SDValue V)12172 bool llvm::isBoolSGPR(SDValue V) {
12173 if (V.getValueType() != MVT::i1)
12174 return false;
12175 switch (V.getOpcode()) {
12176 default:
12177 break;
12178 case ISD::SETCC:
12179 case ISD::IS_FPCLASS:
12180 case AMDGPUISD::FP_CLASS:
12181 return true;
12182 case ISD::AND:
12183 case ISD::OR:
12184 case ISD::XOR:
12185 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
12186 case ISD::SADDO:
12187 case ISD::UADDO:
12188 case ISD::SSUBO:
12189 case ISD::USUBO:
12190 case ISD::SMULO:
12191 case ISD::UMULO:
12192 return V.getResNo() == 1;
12193 case ISD::INTRINSIC_WO_CHAIN: {
12194 unsigned IntrinsicID = V.getConstantOperandVal(0);
12195 switch (IntrinsicID) {
12196 case Intrinsic::amdgcn_is_shared:
12197 case Intrinsic::amdgcn_is_private:
12198 return true;
12199 default:
12200 return false;
12201 }
12202
12203 return false;
12204 }
12205 }
12206 return false;
12207 }
12208
// If every byte of \p C is either 0x00 or 0xff, return \p C unchanged.
// Otherwise (some byte is only partially set) return 0.
static uint32_t getConstantPermuteMask(uint32_t C) {
  for (unsigned Shift = 0; Shift != 32; Shift += 8) {
    const uint32_t Byte = (C >> Shift) & 0xffu;
    if (Byte != 0x00u && Byte != 0xffu)
      return 0; // Partial bytes selected.
  }
  return C;
}
12227
12228 // Check if a node selects whole bytes from its operand 0 starting at a byte
12229 // boundary while masking the rest. Returns select mask as in the v_perm_b32
12230 // or -1 if not succeeded.
12231 // Note byte select encoding:
12232 // value 0-3 selects corresponding source byte;
12233 // value 0xc selects zero;
12234 // value 0xff selects 0xff.
getPermuteMask(SDValue V)12235 static uint32_t getPermuteMask(SDValue V) {
12236 assert(V.getValueSizeInBits() == 32);
12237
12238 if (V.getNumOperands() != 2)
12239 return ~0;
12240
12241 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
12242 if (!N1)
12243 return ~0;
12244
12245 uint32_t C = N1->getZExtValue();
12246
12247 switch (V.getOpcode()) {
12248 default:
12249 break;
12250 case ISD::AND:
12251 if (uint32_t ConstMask = getConstantPermuteMask(C))
12252 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
12253 break;
12254
12255 case ISD::OR:
12256 if (uint32_t ConstMask = getConstantPermuteMask(C))
12257 return (0x03020100 & ~ConstMask) | ConstMask;
12258 break;
12259
12260 case ISD::SHL:
12261 if (C % 8)
12262 return ~0;
12263
12264 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
12265
12266 case ISD::SRL:
12267 if (C % 8)
12268 return ~0;
12269
12270 return uint32_t(0x0c0c0c0c03020100ull >> C);
12271 }
12272
12273 return ~0;
12274 }
12275
performAndCombine(SDNode * N,DAGCombinerInfo & DCI) const12276 SDValue SITargetLowering::performAndCombine(SDNode *N,
12277 DAGCombinerInfo &DCI) const {
12278 if (DCI.isBeforeLegalize())
12279 return SDValue();
12280
12281 SelectionDAG &DAG = DCI.DAG;
12282 EVT VT = N->getValueType(0);
12283 SDValue LHS = N->getOperand(0);
12284 SDValue RHS = N->getOperand(1);
12285
12286 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12287 if (VT == MVT::i64 && CRHS) {
12288 if (SDValue Split =
12289 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
12290 return Split;
12291 }
12292
12293 if (CRHS && VT == MVT::i32) {
12294 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
12295 // nb = number of trailing zeroes in mask
12296 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
12297 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
12298 uint64_t Mask = CRHS->getZExtValue();
12299 unsigned Bits = llvm::popcount(Mask);
12300 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
12301 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
12302 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
12303 unsigned Shift = CShift->getZExtValue();
12304 unsigned NB = CRHS->getAPIntValue().countr_zero();
12305 unsigned Offset = NB + Shift;
12306 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
12307 SDLoc SL(N);
12308 SDValue BFE =
12309 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
12310 DAG.getConstant(Offset, SL, MVT::i32),
12311 DAG.getConstant(Bits, SL, MVT::i32));
12312 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
12313 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
12314 DAG.getValueType(NarrowVT));
12315 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
12316 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
12317 return Shl;
12318 }
12319 }
12320 }
12321
12322 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12323 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
12324 isa<ConstantSDNode>(LHS.getOperand(2))) {
12325 uint32_t Sel = getConstantPermuteMask(Mask);
12326 if (!Sel)
12327 return SDValue();
12328
12329 // Select 0xc for all zero bytes
12330 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
12331 SDLoc DL(N);
12332 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12333 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12334 }
12335 }
12336
12337 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
12338 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
12339 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
12340 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
12341 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
12342
12343 SDValue X = LHS.getOperand(0);
12344 SDValue Y = RHS.getOperand(0);
12345 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
12346 !isTypeLegal(X.getValueType()))
12347 return SDValue();
12348
12349 if (LCC == ISD::SETO) {
12350 if (X != LHS.getOperand(1))
12351 return SDValue();
12352
12353 if (RCC == ISD::SETUNE) {
12354 const ConstantFPSDNode *C1 =
12355 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
12356 if (!C1 || !C1->isInfinity() || C1->isNegative())
12357 return SDValue();
12358
12359 const uint32_t Mask = SIInstrFlags::N_NORMAL |
12360 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
12361 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
12362 SIInstrFlags::P_NORMAL;
12363
12364 static_assert(
12365 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
12366 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
12367 0x3ff) == Mask,
12368 "mask not equal");
12369
12370 SDLoc DL(N);
12371 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
12372 DAG.getConstant(Mask, DL, MVT::i32));
12373 }
12374 }
12375 }
12376
12377 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
12378 std::swap(LHS, RHS);
12379
12380 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12381 RHS.hasOneUse()) {
12382 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
12383 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
12384 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
12385 // | n_nan)
12386 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12387 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
12388 (RHS.getOperand(0) == LHS.getOperand(0) &&
12389 LHS.getOperand(0) == LHS.getOperand(1))) {
12390 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
12391 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
12392 : Mask->getZExtValue() & OrdMask;
12393
12394 SDLoc DL(N);
12395 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
12396 DAG.getConstant(NewMask, DL, MVT::i32));
12397 }
12398 }
12399
12400 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
12401 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
12402 // and x, (sext cc from i1) => select cc, x, 0
12403 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
12404 std::swap(LHS, RHS);
12405 if (isBoolSGPR(RHS.getOperand(0)))
12406 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
12407 DAG.getConstant(0, SDLoc(N), MVT::i32));
12408 }
12409
12410 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12411 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12412 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12413 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12414 uint32_t LHSMask = getPermuteMask(LHS);
12415 uint32_t RHSMask = getPermuteMask(RHS);
12416 if (LHSMask != ~0u && RHSMask != ~0u) {
12417 // Canonicalize the expression in an attempt to have fewer unique masks
12418 // and therefore fewer registers used to hold the masks.
12419 if (LHSMask > RHSMask) {
12420 std::swap(LHSMask, RHSMask);
12421 std::swap(LHS, RHS);
12422 }
12423
12424 // Select 0xc for each lane used from source operand. Zero has 0xc mask
12425 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
12426 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12427 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12428
12429 // Check of we need to combine values from two sources within a byte.
12430 if (!(LHSUsedLanes & RHSUsedLanes) &&
12431 // If we select high and lower word keep it for SDWA.
12432 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12433 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12434 // Each byte in each mask is either selector mask 0-3, or has higher
12435 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
12436 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
12437 // mask which is not 0xff wins. By anding both masks we have a correct
12438 // result except that 0x0c shall be corrected to give 0x0c only.
12439 uint32_t Mask = LHSMask & RHSMask;
12440 for (unsigned I = 0; I < 32; I += 8) {
12441 uint32_t ByteSel = 0xff << I;
12442 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
12443 Mask &= (0x0c << I) & 0xffffffff;
12444 }
12445
12446 // Add 4 to each active LHS lane. It will not affect any existing 0xff
12447 // or 0x0c.
12448 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
12449 SDLoc DL(N);
12450
12451 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12452 RHS.getOperand(0),
12453 DAG.getConstant(Sel, DL, MVT::i32));
12454 }
12455 }
12456 }
12457
12458 return SDValue();
12459 }
12460
12461 // A key component of v_perm is a mapping between byte position of the src
12462 // operands, and the byte position of the dest. To provide such, we need: 1. the
12463 // node that provides x byte of the dest of the OR, and 2. the byte of the node
12464 // used to provide that x byte. calculateByteProvider finds which node provides
12465 // a certain byte of the dest of the OR, and calculateSrcByte takes that node,
// and finds an ultimate src and byte position. For example, the supported
12467 // LoadCombine pattern for vector loads is as follows
12468 // t1
12469 // or
12470 // / \
12471 // t2 t3
12472 // zext shl
12473 // | | \
12474 // t4 t5 16
12475 // or anyext
12476 // / \ |
12477 // t6 t7 t8
12478 // srl shl or
12479 // / | / \ / \
12480 // t9 t10 t11 t12 t13 t14
12481 // trunc* 8 trunc* 8 and and
12482 // | | / | | \
12483 // t15 t16 t17 t18 t19 t20
12484 // trunc* 255 srl -256
12485 // | / \
12486 // t15 t15 16
12487 //
12488 // *In this example, the truncs are from i32->i16
12489 //
12490 // calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
12491 // respectively. calculateSrcByte would find (given node) -> ultimate src &
12492 // byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
12493 // After finding the mapping, we can combine the tree into vperm t15, t16,
12494 // 0x05000407
12495
12496 // Find the source and byte position from a node.
12497 // \p DestByte is the byte position of the dest of the or that the src
12498 // ultimately provides. \p SrcIndex is the byte of the src that maps to this
12499 // dest of the or byte. \p Depth tracks how many recursive iterations we have
12500 // performed.
12501 static const std::optional<ByteProvider<SDValue>>
calculateSrcByte(const SDValue Op,uint64_t DestByte,uint64_t SrcIndex=0,unsigned Depth=0)12502 calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
12503 unsigned Depth = 0) {
12504 // We may need to recursively traverse a series of SRLs
12505 if (Depth >= 6)
12506 return std::nullopt;
12507
12508 if (Op.getValueSizeInBits() < 8)
12509 return std::nullopt;
12510
12511 if (Op.getValueType().isVector())
12512 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12513
12514 switch (Op->getOpcode()) {
12515 case ISD::TRUNCATE: {
12516 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12517 }
12518
12519 case ISD::SIGN_EXTEND:
12520 case ISD::ZERO_EXTEND:
12521 case ISD::SIGN_EXTEND_INREG: {
12522 SDValue NarrowOp = Op->getOperand(0);
12523 auto NarrowVT = NarrowOp.getValueType();
12524 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
12525 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12526 NarrowVT = VTSign->getVT();
12527 }
12528 if (!NarrowVT.isByteSized())
12529 return std::nullopt;
12530 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
12531
12532 if (SrcIndex >= NarrowByteWidth)
12533 return std::nullopt;
12534 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12535 }
12536
12537 case ISD::SRA:
12538 case ISD::SRL: {
12539 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12540 if (!ShiftOp)
12541 return std::nullopt;
12542
12543 uint64_t BitShift = ShiftOp->getZExtValue();
12544
12545 if (BitShift % 8 != 0)
12546 return std::nullopt;
12547
12548 SrcIndex += BitShift / 8;
12549
12550 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12551 }
12552
12553 default: {
12554 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12555 }
12556 }
12557 llvm_unreachable("fully handled switch");
12558 }
12559
12560 // For a byte position in the result of an Or, traverse the tree and find the
12561 // node (and the byte of the node) which ultimately provides this {Or,
12562 // BytePosition}. \p Op is the operand we are currently examining. \p Index is
12563 // the byte position of the Op that corresponds with the originally requested
12564 // byte of the Or \p Depth tracks how many recursive iterations we have
12565 // performed. \p StartingIndex is the originally requested byte of the Or
12566 static const std::optional<ByteProvider<SDValue>>
calculateByteProvider(const SDValue & Op,unsigned Index,unsigned Depth,unsigned StartingIndex=0)12567 calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
12568 unsigned StartingIndex = 0) {
12569 // Finding Src tree of RHS of or typically requires at least 1 additional
12570 // depth
12571 if (Depth > 6)
12572 return std::nullopt;
12573
12574 unsigned BitWidth = Op.getScalarValueSizeInBits();
12575 if (BitWidth % 8 != 0)
12576 return std::nullopt;
12577 if (Index > BitWidth / 8 - 1)
12578 return std::nullopt;
12579
12580 bool IsVec = Op.getValueType().isVector();
12581 switch (Op.getOpcode()) {
12582 case ISD::OR: {
12583 if (IsVec)
12584 return std::nullopt;
12585
12586 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
12587 StartingIndex);
12588 if (!RHS)
12589 return std::nullopt;
12590 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12591 StartingIndex);
12592 if (!LHS)
12593 return std::nullopt;
12594 // A well formed Or will have two ByteProviders for each byte, one of which
12595 // is constant zero
12596 if (!LHS->isConstantZero() && !RHS->isConstantZero())
12597 return std::nullopt;
12598 if (!LHS || LHS->isConstantZero())
12599 return RHS;
12600 if (!RHS || RHS->isConstantZero())
12601 return LHS;
12602 return std::nullopt;
12603 }
12604
12605 case ISD::AND: {
12606 if (IsVec)
12607 return std::nullopt;
12608
12609 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12610 if (!BitMaskOp)
12611 return std::nullopt;
12612
12613 uint32_t BitMask = BitMaskOp->getZExtValue();
12614 // Bits we expect for our StartingIndex
12615 uint32_t IndexMask = 0xFF << (Index * 8);
12616
12617 if ((IndexMask & BitMask) != IndexMask) {
12618 // If the result of the and partially provides the byte, then it
12619 // is not well formatted
12620 if (IndexMask & BitMask)
12621 return std::nullopt;
12622 return ByteProvider<SDValue>::getConstantZero();
12623 }
12624
12625 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
12626 }
12627
12628 case ISD::FSHR: {
12629 if (IsVec)
12630 return std::nullopt;
12631
12632 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
12633 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12634 if (!ShiftOp || Op.getValueType().isVector())
12635 return std::nullopt;
12636
12637 uint64_t BitsProvided = Op.getValueSizeInBits();
12638 if (BitsProvided % 8 != 0)
12639 return std::nullopt;
12640
12641 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12642 if (BitShift % 8)
12643 return std::nullopt;
12644
12645 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12646 uint64_t ByteShift = BitShift / 8;
12647
12648 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12649 uint64_t BytesProvided = BitsProvided / 8;
12650 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12651 NewIndex %= BytesProvided;
12652 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
12653 }
12654
12655 case ISD::SRA:
12656 case ISD::SRL: {
12657 if (IsVec)
12658 return std::nullopt;
12659
12660 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12661 if (!ShiftOp)
12662 return std::nullopt;
12663
12664 uint64_t BitShift = ShiftOp->getZExtValue();
12665 if (BitShift % 8)
12666 return std::nullopt;
12667
12668 auto BitsProvided = Op.getScalarValueSizeInBits();
12669 if (BitsProvided % 8 != 0)
12670 return std::nullopt;
12671
12672 uint64_t BytesProvided = BitsProvided / 8;
12673 uint64_t ByteShift = BitShift / 8;
12674 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
12675 // If the byte we are trying to provide (as tracked by index) falls in this
12676 // range, then the SRL provides the byte. The byte of interest of the src of
12677 // the SRL is Index + ByteShift
12678 return BytesProvided - ByteShift > Index
12679 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
12680 Index + ByteShift)
12681 : ByteProvider<SDValue>::getConstantZero();
12682 }
12683
12684 case ISD::SHL: {
12685 if (IsVec)
12686 return std::nullopt;
12687
12688 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12689 if (!ShiftOp)
12690 return std::nullopt;
12691
12692 uint64_t BitShift = ShiftOp->getZExtValue();
12693 if (BitShift % 8 != 0)
12694 return std::nullopt;
12695 uint64_t ByteShift = BitShift / 8;
12696
12697 // If we are shifting by an amount greater than (or equal to)
12698 // the index we are trying to provide, then it provides 0s. If not,
12699 // then this bytes are not definitively 0s, and the corresponding byte
12700 // of interest is Index - ByteShift of the src
12701 return Index < ByteShift
12702 ? ByteProvider<SDValue>::getConstantZero()
12703 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
12704 Depth + 1, StartingIndex);
12705 }
12706 case ISD::ANY_EXTEND:
12707 case ISD::SIGN_EXTEND:
12708 case ISD::ZERO_EXTEND:
12709 case ISD::SIGN_EXTEND_INREG:
12710 case ISD::AssertZext:
12711 case ISD::AssertSext: {
12712 if (IsVec)
12713 return std::nullopt;
12714
12715 SDValue NarrowOp = Op->getOperand(0);
12716 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
12717 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
12718 Op->getOpcode() == ISD::AssertZext ||
12719 Op->getOpcode() == ISD::AssertSext) {
12720 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12721 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12722 }
12723 if (NarrowBitWidth % 8 != 0)
12724 return std::nullopt;
12725 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12726
12727 if (Index >= NarrowByteWidth)
12728 return Op.getOpcode() == ISD::ZERO_EXTEND
12729 ? std::optional<ByteProvider<SDValue>>(
12730 ByteProvider<SDValue>::getConstantZero())
12731 : std::nullopt;
12732 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
12733 }
12734
12735 case ISD::TRUNCATE: {
12736 if (IsVec)
12737 return std::nullopt;
12738
12739 uint64_t NarrowByteWidth = BitWidth / 8;
12740
12741 if (NarrowByteWidth >= Index) {
12742 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12743 StartingIndex);
12744 }
12745
12746 return std::nullopt;
12747 }
12748
12749 case ISD::CopyFromReg: {
12750 if (BitWidth / 8 > Index)
12751 return calculateSrcByte(Op, StartingIndex, Index);
12752
12753 return std::nullopt;
12754 }
12755
12756 case ISD::LOAD: {
12757 auto *L = cast<LoadSDNode>(Op.getNode());
12758
12759 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12760 if (NarrowBitWidth % 8 != 0)
12761 return std::nullopt;
12762 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12763
12764 // If the width of the load does not reach byte we are trying to provide for
12765 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
12766 // question
12767 if (Index >= NarrowByteWidth) {
12768 return L->getExtensionType() == ISD::ZEXTLOAD
12769 ? std::optional<ByteProvider<SDValue>>(
12770 ByteProvider<SDValue>::getConstantZero())
12771 : std::nullopt;
12772 }
12773
12774 if (NarrowByteWidth > Index) {
12775 return calculateSrcByte(Op, StartingIndex, Index);
12776 }
12777
12778 return std::nullopt;
12779 }
12780
12781 case ISD::BSWAP: {
12782 if (IsVec)
12783 return std::nullopt;
12784
12785 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12786 Depth + 1, StartingIndex);
12787 }
12788
12789 case ISD::EXTRACT_VECTOR_ELT: {
12790 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12791 if (!IdxOp)
12792 return std::nullopt;
12793 auto VecIdx = IdxOp->getZExtValue();
12794 auto ScalarSize = Op.getScalarValueSizeInBits();
12795 if (ScalarSize < 32)
12796 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12797 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
12798 StartingIndex, Index);
12799 }
12800
12801 case AMDGPUISD::PERM: {
12802 if (IsVec)
12803 return std::nullopt;
12804
12805 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12806 if (!PermMask)
12807 return std::nullopt;
12808
12809 auto IdxMask =
12810 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12811 if (IdxMask > 0x07 && IdxMask != 0x0c)
12812 return std::nullopt;
12813
12814 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12815 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12816
12817 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12818 : ByteProvider<SDValue>(
12819 ByteProvider<SDValue>::getConstantZero());
12820 }
12821
12822 default: {
12823 return std::nullopt;
12824 }
12825 }
12826
12827 llvm_unreachable("fully handled switch");
12828 }
12829
12830 // Returns true if the Operand is a scalar and is 16 bits
isExtendedFrom16Bits(SDValue & Operand)12831 static bool isExtendedFrom16Bits(SDValue &Operand) {
12832
12833 switch (Operand.getOpcode()) {
12834 case ISD::ANY_EXTEND:
12835 case ISD::SIGN_EXTEND:
12836 case ISD::ZERO_EXTEND: {
12837 auto OpVT = Operand.getOperand(0).getValueType();
12838 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12839 }
12840 case ISD::LOAD: {
12841 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12842 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12843 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12844 ExtType == ISD::EXTLOAD) {
12845 auto MemVT = L->getMemoryVT();
12846 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12847 }
12848 return L->getMemoryVT().getSizeInBits() == 16;
12849 }
12850 default:
12851 return false;
12852 }
12853 }
12854
// Returns true if the two byte selectors held in the low 16 bits of Mask
// pick consecutive bytes (high selector == low selector + 1) and the low
// selector is even, i.e. the pair starts on a 16-bit boundary.
static bool addresses16Bits(int Mask) {
  const int FirstByte = Mask & 0xff;
  const int SecondByte = (Mask >> 8) & 0xff;

  assert(FirstByte < 8 && SecondByte < 8);

  // The pair has to start at a location that is aligned for 16 bit
  // instructions. A counter example is taking 2 consecutive bytes starting at
  // the 8th bit: code would still be needed to extract the 16 bit operand, so
  // an i8 v_perm is the better choice there.
  if (FirstByte % 2 != 0)
    return false;

  // The bytes must be contiguous in the order of increasing addresses.
  return SecondByte == FirstByte + 1;
}
12872
12873 // Do not lower into v_perm if the operands are actually 16 bit
12874 // and the selected bits (based on PermMask) correspond with two
12875 // easily addressable 16 bit operands.
hasNon16BitAccesses(uint64_t PermMask,SDValue & Op,SDValue & OtherOp)12876 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12877 SDValue &OtherOp) {
12878 int Low16 = PermMask & 0xffff;
12879 int Hi16 = (PermMask & 0xffff0000) >> 16;
12880
12881 auto TempOp = peekThroughBitcasts(Op);
12882 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12883
12884 auto OpIs16Bit =
12885 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12886 if (!OpIs16Bit)
12887 return true;
12888
12889 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12890 isExtendedFrom16Bits(TempOtherOp);
12891 if (!OtherOpIs16Bit)
12892 return true;
12893
12894 // Do we cleanly address both
12895 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12896 }
12897
getDWordFromOffset(SelectionDAG & DAG,SDLoc SL,SDValue Src,unsigned DWordOffset)12898 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12899 unsigned DWordOffset) {
12900 SDValue Ret;
12901
12902 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12903 // ByteProvider must be at least 8 bits
12904 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12905
12906 if (TypeSize <= 32)
12907 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12908
12909 if (Src.getValueType().isVector()) {
12910 auto ScalarTySize = Src.getScalarValueSizeInBits();
12911 auto ScalarTy = Src.getValueType().getScalarType();
12912 if (ScalarTySize == 32) {
12913 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12914 DAG.getConstant(DWordOffset, SL, MVT::i32));
12915 }
12916 if (ScalarTySize > 32) {
12917 Ret = DAG.getNode(
12918 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12919 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12920 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12921 if (ShiftVal)
12922 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12923 DAG.getConstant(ShiftVal, SL, MVT::i32));
12924 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12925 }
12926
12927 assert(ScalarTySize < 32);
12928 auto NumElements = TypeSize / ScalarTySize;
12929 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12930 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12931 auto NumElementsIn32 = 32 / ScalarTySize;
12932 auto NumAvailElements = DWordOffset < Trunc32Elements
12933 ? NumElementsIn32
12934 : NumElements - NormalizedTrunc;
12935
12936 SmallVector<SDValue, 4> VecSrcs;
12937 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12938 NumAvailElements);
12939
12940 Ret = DAG.getBuildVector(
12941 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12942 VecSrcs);
12943 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12944 }
12945
12946 /// Scalar Type
12947 auto ShiftVal = 32 * DWordOffset;
12948 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12949 DAG.getConstant(ShiftVal, SL, MVT::i32));
12950 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12951 }
12952
/// Try to express the i32 node \p N as a single AMDGPUISD::PERM.
///
/// Each of the four result bytes is traced back to the byte that provides it
/// using calculateByteProvider. The match fails, returning an empty SDValue,
/// when a byte has no provider, when a byte is a constant zero, or when the
/// bytes come from more than two distinct (source node, dword) pairs.
///
/// When every byte comes from one dword and the selector leaves the bytes in
/// place, that dword is returned directly instead of a PERM. Otherwise a PERM
/// is only built if hasNon16BitAccesses returns true for the selector and the
/// two operands; if it returns false an empty SDValue is returned.
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  [[maybe_unused]] EVT VT = N->getValueType(0);
  SmallVector<ByteProvider<SDValue>, 8> PermNodes;

  // VT is known to be MVT::i32, so we need to provide 4 bytes.
  assert(VT == MVT::i32);
  for (int i = 0; i < 4; i++) {
    // Find the ByteProvider that provides the ith byte of the result of OR
    std::optional<ByteProvider<SDValue>> P =
        calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
    // TODO support constantZero
    if (!P || P->isConstantZero())
      return SDValue();

    PermNodes.push_back(*P);
  }
  // The loop above either returns early or pushes exactly four providers.
  if (PermNodes.size() != 4)
    return SDValue();

  // Each source is tracked as (index into PermNodes, dword within the source).
  // The provider of byte 0 defines the first source.
  std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
  std::optional<std::pair<unsigned, unsigned>> SecondSrc;
  uint64_t PermMask = 0x00000000;
  for (size_t i = 0; i < PermNodes.size(); i++) {
    auto PermOp = PermNodes[i];
    // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
    // by sizeof(Src2) = 4
    int SrcByteAdjust = 4;

    // If the Src uses a byte from a different DWORD, then it corresponds
    // with a different source
    if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
        ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
      // A byte that matches neither the first nor an already recorded second
      // source would need a third operand, which PERM cannot take.
      if (SecondSrc)
        if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
            ((PermOp.SrcOffset / 4) != SecondSrc->second))
          return SDValue();

      // Set the index of the second distinct Src node. This also runs when
      // the byte matches the second source already recorded, in which case it
      // just re-points at an equivalent provider.
      SecondSrc = {i, PermNodes[i].SrcOffset / 4};
      assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
      SrcByteAdjust = 0;
    }
    assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
    assert(!DAG.getDataLayout().isBigEndian());
    // The selector for result byte i occupies bits [8 * i, 8 * i + 8).
    PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
  }
  SDLoc DL(N);
  SDValue Op = *PermNodes[FirstSrc.first].Src;
  Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
  assert(Op.getValueSizeInBits() == 32);

  // Check that we are not just extracting the bytes in order from an op
  if (!SecondSrc) {
    int Low16 = PermMask & 0xffff;
    int Hi16 = (PermMask & 0xffff0000) >> 16;

    bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
    bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);

    // The perm op would really just produce Op. So combine into Op
    if (WellFormedLow && WellFormedHi)
      return DAG.getBitcast(MVT::getIntegerVT(32), Op);
  }

  // With a single source the same dword is used for both PERM operands.
  SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;

  if (SecondSrc) {
    OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
    assert(OtherOp.getValueSizeInBits() == 32);
  }

  if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {

    assert(Op.getValueType().isByteSized() &&
           OtherOp.getValueType().isByteSized());

    // If the ultimate src is less than 32 bits, then we will only be
    // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
    // CalculateByteProvider would not have returned Op as source if we
    // used a byte that is outside its ValueType. Thus, we are free to
    // ANY_EXTEND as the extended bits are dont-cares.
    Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
    OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);

    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
                       DAG.getConstant(PermMask, DL, MVT::i32));
  }
  return SDValue();
}
13043
/// Target-specific combine for an OR node \p N.
///
/// The following folds are tried in order; an empty SDValue is returned when
/// none applies:
///  - i1: or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2).
///    Nothing else is attempted for i1.
///  - or (perm x, y, c1), c2 -> perm x, y, merged selector.
///  - Divergent i32 with single-use operands, when V_PERM_B32 maps to an MC
///    opcode: merge two operands that each have a permute mask into one perm,
///    or fall back to matchPERM when at least one of them has no such mask.
///  - i64, not before operation legalization: split an or with a
///    zero-extended i32 into a v2i32 build_vector, or split an or with a
///    constant via splitBinaryBitConstantOp.
SDValue SITargetLowering::performOrCombine(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  EVT VT = N->getValueType(0);
  if (VT == MVT::i1) {
    // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
    if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
        RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
      // Both class tests must look at the same value.
      SDValue Src = LHS.getOperand(0);
      if (Src != RHS.getOperand(0))
        return SDValue();

      const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
      const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
      if (!CLHS || !CRHS)
        return SDValue();

      // Only 10 bits are used.
      static const uint32_t MaxMask = 0x3ff;

      uint32_t NewMask =
          (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
      SDLoc DL(N);
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
                         DAG.getConstant(NewMask, DL, MVT::i32));
    }

    return SDValue();
  }

  // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
  if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
      LHS.getOpcode() == AMDGPUISD::PERM &&
      isa<ConstantSDNode>(LHS.getOperand(2))) {
    uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
    // A zero result from getConstantPermuteMask abandons the whole combine.
    if (!Sel)
      return SDValue();

    Sel |= LHS.getConstantOperandVal(2);
    SDLoc DL(N);
    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                       LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
  }

  // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {

    // If all the uses of an or need to extract the individual elements, do not
    // attempt to lower into v_perm
    auto usesCombinedOperand = [](SDNode *OrUse) {
      // If we have any non-vectorized use, then it is a candidate for v_perm
      if (OrUse->getOpcode() != ISD::BITCAST ||
          !OrUse->getValueType(0).isVector())
        return true;

      // The use is a bitcast to a vector: look at how that vector is used.
      // Any user producing a non-vector value makes this a candidate.
      for (auto *VUser : OrUse->users()) {
        if (!VUser->getValueType(0).isVector())
          return true;

        // If the use of a vector is a store, then combining via a v_perm
        // is beneficial.
        // TODO -- whitelist more uses
        for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
          if (VUser->getOpcode() == VectorwiseOp)
            return true;
      }
      return false;
    };

    if (!any_of(N->users(), usesCombinedOperand))
      return SDValue();

    // ~0u from getPermuteMask is treated below as "no mask available".
    uint32_t LHSMask = getPermuteMask(LHS);
    uint32_t RHSMask = getPermuteMask(RHS);

    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Kill zero bytes selected by other mask. Zero value is 0xc.
        LHSMask &= ~RHSUsedLanes;
        RHSMask &= ~LHSUsedLanes;
        // Add 4 to each active LHS lane
        LHSMask |= LHSUsedLanes & 0x04040404;
        // Combine masks
        uint32_t Sel = LHSMask | RHSMask;
        SDLoc DL(N);

        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                           RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
    // At least one side has no permute mask: try the byte-provider based
    // matcher on the whole or instead.
    if (LHSMask == ~0u || RHSMask == ~0u) {
      if (SDValue Perm = matchPERM(N, DCI))
        return Perm;
    }
  }

  // Everything below only applies to i64 once operation legalization has
  // started.
  if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
    return SDValue();

  // TODO: This could be a generic combine with a predicate for extracting the
  // high half of an integer being free.

  // (or i64:x, (zero_extend i32:y)) ->
  //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
  // Move a lone zero_extend to the RHS so only one operand order is matched.
  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
      RHS.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(LHS, RHS);

  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
    SDValue ExtSrc = RHS.getOperand(0);
    EVT SrcVT = ExtSrc.getValueType();
    if (SrcVT == MVT::i32) {
      SDLoc SL(N);
      auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
      SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);

      // Queue the new nodes so the combiner revisits them.
      DCI.AddToWorklist(LowOr.getNode());
      DCI.AddToWorklist(HiBits.getNode());

      SDValue Vec =
          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
      return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
    }
  }

  // Uses the node's original operand 1, not the possibly swapped RHS above.
  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (CRHS) {
    if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
                                                 N->getOperand(0), CRHS))
      return Split;
  }

  return SDValue();
}
13201
performXorCombine(SDNode * N,DAGCombinerInfo & DCI) const13202 SDValue SITargetLowering::performXorCombine(SDNode *N,
13203 DAGCombinerInfo &DCI) const {
13204 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
13205 return RV;
13206
13207 SDValue LHS = N->getOperand(0);
13208 SDValue RHS = N->getOperand(1);
13209
13210 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13211 SelectionDAG &DAG = DCI.DAG;
13212
13213 EVT VT = N->getValueType(0);
13214 if (CRHS && VT == MVT::i64) {
13215 if (SDValue Split =
13216 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
13217 return Split;
13218 }
13219
13220 // Make sure to apply the 64-bit constant splitting fold before trying to fold
13221 // fneg-like xors into 64-bit select.
13222 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
13223 // This looks like an fneg, try to fold as a source modifier.
13224 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
13225 shouldFoldFNegIntoSrc(N, LHS)) {
13226 // xor (select c, a, b), 0x80000000 ->
13227 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
13228 SDLoc DL(N);
13229 SDValue CastLHS =
13230 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
13231 SDValue CastRHS =
13232 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
13233 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
13234 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
13235 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
13236 LHS->getOperand(0), FNegLHS, FNegRHS);
13237 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
13238 }
13239 }
13240
13241 return SDValue();
13242 }
13243
performZeroExtendCombine(SDNode * N,DAGCombinerInfo & DCI) const13244 SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
13245 DAGCombinerInfo &DCI) const {
13246 if (!Subtarget->has16BitInsts() ||
13247 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
13248 return SDValue();
13249
13250 EVT VT = N->getValueType(0);
13251 if (VT != MVT::i32)
13252 return SDValue();
13253
13254 SDValue Src = N->getOperand(0);
13255 if (Src.getValueType() != MVT::i16)
13256 return SDValue();
13257
13258 return SDValue();
13259 }
13260
13261 SDValue
performSignExtendInRegCombine(SDNode * N,DAGCombinerInfo & DCI) const13262 SITargetLowering::performSignExtendInRegCombine(SDNode *N,
13263 DAGCombinerInfo &DCI) const {
13264 SDValue Src = N->getOperand(0);
13265 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
13266
13267 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
13268 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
13269 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
13270 VTSign->getVT() == MVT::i8) ||
13271 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
13272 VTSign->getVT() == MVT::i16))) {
13273 assert(Subtarget->hasScalarSubwordLoads() &&
13274 "s_buffer_load_{u8, i8} are supported "
13275 "in GFX12 (or newer) architectures.");
13276 EVT VT = Src.getValueType();
13277 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
13278 ? AMDGPUISD::SBUFFER_LOAD_BYTE
13279 : AMDGPUISD::SBUFFER_LOAD_SHORT;
13280 SDLoc DL(N);
13281 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
13282 SDValue Ops[] = {
13283 Src.getOperand(0), // source register
13284 Src.getOperand(1), // offset
13285 Src.getOperand(2) // cachePolicy
13286 };
13287 auto *M = cast<MemSDNode>(Src);
13288 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
13289 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13290 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
13291 return LoadVal;
13292 }
13293 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
13294 VTSign->getVT() == MVT::i8) ||
13295 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
13296 VTSign->getVT() == MVT::i16)) &&
13297 Src.hasOneUse()) {
13298 auto *M = cast<MemSDNode>(Src);
13299 SDValue Ops[] = {Src.getOperand(0), // Chain
13300 Src.getOperand(1), // rsrc
13301 Src.getOperand(2), // vindex
13302 Src.getOperand(3), // voffset
13303 Src.getOperand(4), // soffset
13304 Src.getOperand(5), // offset
13305 Src.getOperand(6), Src.getOperand(7)};
13306 // replace with BUFFER_LOAD_BYTE/SHORT
13307 SDVTList ResList =
13308 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
13309 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
13310 ? AMDGPUISD::BUFFER_LOAD_BYTE
13311 : AMDGPUISD::BUFFER_LOAD_SHORT;
13312 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
13313 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13314 return DCI.DAG.getMergeValues(
13315 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
13316 }
13317 return SDValue();
13318 }
13319
performClassCombine(SDNode * N,DAGCombinerInfo & DCI) const13320 SDValue SITargetLowering::performClassCombine(SDNode *N,
13321 DAGCombinerInfo &DCI) const {
13322 SelectionDAG &DAG = DCI.DAG;
13323 SDValue Mask = N->getOperand(1);
13324
13325 // fp_class x, 0 -> false
13326 if (isNullConstant(Mask))
13327 return DAG.getConstant(0, SDLoc(N), MVT::i1);
13328
13329 if (N->getOperand(0).isUndef())
13330 return DAG.getUNDEF(MVT::i1);
13331
13332 return SDValue();
13333 }
13334
performRcpCombine(SDNode * N,DAGCombinerInfo & DCI) const13335 SDValue SITargetLowering::performRcpCombine(SDNode *N,
13336 DAGCombinerInfo &DCI) const {
13337 EVT VT = N->getValueType(0);
13338 SDValue N0 = N->getOperand(0);
13339
13340 if (N0.isUndef()) {
13341 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
13342 SDLoc(N), VT);
13343 }
13344
13345 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
13346 N0.getOpcode() == ISD::SINT_TO_FP)) {
13347 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
13348 N->getFlags());
13349 }
13350
13351 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
13352 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
13353 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
13354 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
13355 N->getFlags());
13356 }
13357
13358 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
13359 }
13360
/// Returns true if the value \p Op can be shown to already be in canonical
/// floating-point form, so that an fcanonicalize of it would be redundant.
///
/// \p MaxDepth bounds how many levels of operands are inspected. Once it
/// reaches zero only FCANONICALIZE nodes and FP constants are recognised and
/// everything else is reported as not canonical.
bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
                                       unsigned MaxDepth) const {
  unsigned Opcode = Op.getOpcode();
  if (Opcode == ISD::FCANONICALIZE)
    return true;

  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    const auto &F = CFP->getValueAPF();
    // A signaling NaN constant is never canonical.
    if (F.isNaN() && F.isSignaling())
      return false;
    if (!F.isDenormal())
      return true;

    // A denormal constant is only canonical when the function's denormal mode
    // for this format is IEEE.
    DenormalMode Mode =
        DAG.getMachineFunction().getDenormalMode(F.getSemantics());
    return Mode == DenormalMode::getIEEE();
  }

  // If source is a result of another standard FP operation it is already in
  // canonical form.
  // Stop here once the recursion budget is exhausted.
  if (MaxDepth == 0)
    return false;

  switch (Opcode) {
  // These will flush denorms if required.
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FCEIL:
  case ISD::FFLOOR:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FSQRT:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FP_ROUND:
  case ISD::FP_EXTEND:
  case ISD::FP16_TO_FP:
  case ISD::FP_TO_FP16:
  case ISD::BF16_TO_FP:
  case ISD::FP_TO_BF16:
  case ISD::FLDEXP:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RSQ_CLAMP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::LOG:
  case AMDGPUISD::EXP:
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
  case AMDGPUISD::FP_TO_FP16:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::COS_HW:
    return true;

  // It can/will be lowered or combined as a bit operation.
  // Need to check their input recursively to handle.
  // Only operand 0 is checked, including for FCOPYSIGN.
  case ISD::FNEG:
  case ISD::FABS:
  case ISD::FCOPYSIGN:
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);

  case ISD::AND:
    if (Op.getValueType() == MVT::i32) {
      // Be careful as we only know it is a bitcast floating point type. It
      // could be f32, v2f16, we have no way of knowing. Luckily the constant
      // value that we optimize for, which comes up in fp32 to bf16 conversions,
      // is valid to optimize for all types.
      if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
        if (RHS->getZExtValue() == 0xffff0000) {
          return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
        }
      }
    }
    // Any other AND falls through to the generic check after the switch.
    break;

  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FSINCOS:
    // Reported as canonical for every scalar type except f16.
    return Op.getValueType().getScalarType() != MVT::f16;

  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAXIMUM3:
  case AMDGPUISD::FMINIMUM3: {
    // FIXME: Shouldn't treat the generic operations different based these.
    // However, we aren't really required to flush the result from
    // minnum/maxnum..

    // snans will be quieted, so we only need to worry about denormals.
    if (Subtarget->supportsMinMaxDenormModes() ||
        // FIXME: denormalsEnabledForType is broken for dynamic
        denormalsEnabledForType(DAG, Op.getValueType()))
      return true;

    // Flushing may be required.
    // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
    // targets need to check their input recursively.

    // FIXME: Does this apply with clamp? It's implemented with max.
    for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
      if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
        return false;
    }

    return true;
  }
  case ISD::SELECT: {
    // Both selectable values (operands 1 and 2) must be canonical; the
    // condition in operand 0 is not inspected.
    return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
           isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
  }
  case ISD::BUILD_VECTOR: {
    // Every element must be canonical.
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      SDValue SrcOp = Op.getOperand(i);
      if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
        return false;
    }

    return true;
  }
  case ISD::EXTRACT_VECTOR_ELT:
  case ISD::EXTRACT_SUBVECTOR: {
    // The result is canonical if the vector it is taken from is.
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
  }
  case ISD::INSERT_VECTOR_ELT: {
    // Both the base vector and the inserted element must be canonical.
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
           isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
  }
  case ISD::UNDEF:
    // Could be anything.
    return false;

  case ISD::BITCAST:
    // TODO: This is incorrect as it loses track of the operand's type. We may
    // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
    // same bits that are canonicalized in one type need not be in the other.
    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
  case ISD::TRUNCATE: {
    // Hack round the mess we make when legalizing extract_vector_elt
    // Matches: i16 (truncate (i32 (bitcast v2f16:x))) and checks x.
    if (Op.getValueType() == MVT::i16) {
      SDValue TruncSrc = Op.getOperand(0);
      if (TruncSrc.getValueType() == MVT::i32 &&
          TruncSrc.getOpcode() == ISD::BITCAST &&
          TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
        return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
      }
    }
    return false;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID = Op.getConstantOperandVal(0);
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_sqrt:
      return true;
    default:
      break;
    }

    break;
  }
  default:
    break;
  }

  // Generic check for everything that left the switch via break: the value is
  // accepted when denormals are enabled for its type and it is known never to
  // be a signaling NaN.
  // FIXME: denormalsEnabledForType is broken for dynamic
  return denormalsEnabledForType(DAG, Op.getValueType()) &&
         DAG.isKnownNeverSNaN(Op);
}
13562
/// Machine-IR counterpart of the check above: returns true if the value in
/// virtual register \p Reg can be shown to already be in canonical
/// floating-point form, judging by the instruction that defines it.
///
/// \p MaxDepth bounds how many levels of defining instructions are followed.
/// Once it reaches zero only G_FCANONICALIZE and FP constants/splats are
/// recognised. Opcodes that are not listed are reported as not canonical.
bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
                                       unsigned MaxDepth) const {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // NOTE(review): MI is dereferenced unconditionally; this assumes Reg always
  // has a defining instruction - confirm against callers.
  MachineInstr *MI = MRI.getVRegDef(Reg);
  unsigned Opcode = MI->getOpcode();

  if (Opcode == AMDGPU::G_FCANONICALIZE)
    return true;

  std::optional<FPValueAndVReg> FCR;
  // Constant splat (can be padded with undef) or scalar constant.
  if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
    if (FCR->Value.isSignaling())
      return false;
    if (!FCR->Value.isDenormal())
      return true;

    // A denormal constant is only canonical when the function's denormal mode
    // for this format is IEEE.
    DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
    return Mode == DenormalMode::getIEEE();
  }

  // Stop here once the recursion budget is exhausted.
  if (MaxDepth == 0)
    return false;

  switch (Opcode) {
  // Operations whose result is treated as canonical without looking at their
  // inputs.
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case AMDGPU::G_FPOW:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FLOG:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FLOG10:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
    return true;
  // The result is canonical if the value in operand 1 is.
  case AMDGPU::G_FNEG:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FCOPYSIGN:
    return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_FMINIMUMNUM:
  case AMDGPU::G_FMAXIMUMNUM: {
    if (Subtarget->supportsMinMaxDenormModes() ||
        // FIXME: denormalsEnabledForType is broken for dynamic
        denormalsEnabledForType(MRI.getType(Reg), MF))
      return true;

    // Otherwise the result is only canonical if every input is; share the
    // per-operand loop used for G_BUILD_VECTOR.
    [[fallthrough]];
  }
  case AMDGPU::G_BUILD_VECTOR:
    // drop_begin skips the first operand (presumably the result register), so
    // only the remaining operands are checked.
    for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
      if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
        return false;
    return true;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_div_scale:
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_tanh:
      return true;
    default:
      break;
    }

    // Intrinsics not listed above are handled like any unknown opcode.
    [[fallthrough]];
  default:
    return false;
  }

  // Every switch path above returns.
  llvm_unreachable("invalid operation");
}
13681
13682 // Constant fold canonicalize.
getCanonicalConstantFP(SelectionDAG & DAG,const SDLoc & SL,EVT VT,const APFloat & C) const13683 SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
13684 const SDLoc &SL, EVT VT,
13685 const APFloat &C) const {
13686 // Flush denormals to 0 if not enabled.
13687 if (C.isDenormal()) {
13688 DenormalMode Mode =
13689 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
13690 if (Mode == DenormalMode::getPreserveSign()) {
13691 return DAG.getConstantFP(
13692 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
13693 }
13694
13695 if (Mode != DenormalMode::getIEEE())
13696 return SDValue();
13697 }
13698
13699 if (C.isNaN()) {
13700 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
13701 if (C.isSignaling()) {
13702 // Quiet a signaling NaN.
13703 // FIXME: Is this supposed to preserve payload bits?
13704 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13705 }
13706
13707 // Make sure it is the canonical NaN bitpattern.
13708 //
13709 // TODO: Can we use -1 as the canonical NaN value since it's an inline
13710 // immediate?
13711 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
13712 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13713 }
13714
13715 // Already canonical.
13716 return DAG.getConstantFP(C, SL, VT);
13717 }
13718
vectorEltWillFoldAway(SDValue Op)13719 static bool vectorEltWillFoldAway(SDValue Op) {
13720 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
13721 }
13722
13723 SDValue
performFCanonicalizeCombine(SDNode * N,DAGCombinerInfo & DCI) const13724 SITargetLowering::performFCanonicalizeCombine(SDNode *N,
13725 DAGCombinerInfo &DCI) const {
13726 SelectionDAG &DAG = DCI.DAG;
13727 SDValue N0 = N->getOperand(0);
13728 EVT VT = N->getValueType(0);
13729
13730 // fcanonicalize undef -> qnan
13731 if (N0.isUndef()) {
13732 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
13733 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
13734 }
13735
13736 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
13737 EVT VT = N->getValueType(0);
13738 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
13739 }
13740
13741 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
13742 // (fcanonicalize k)
13743 //
13744 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
13745
13746 // TODO: This could be better with wider vectors that will be split to v2f16,
13747 // and to consider uses since there aren't that many packed operations.
13748 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
13749 isTypeLegal(MVT::v2f16)) {
13750 SDLoc SL(N);
13751 SDValue NewElts[2];
13752 SDValue Lo = N0.getOperand(0);
13753 SDValue Hi = N0.getOperand(1);
13754 EVT EltVT = Lo.getValueType();
13755
13756 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
13757 for (unsigned I = 0; I != 2; ++I) {
13758 SDValue Op = N0.getOperand(I);
13759 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13760 NewElts[I] =
13761 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
13762 } else if (Op.isUndef()) {
13763 // Handled below based on what the other operand is.
13764 NewElts[I] = Op;
13765 } else {
13766 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
13767 }
13768 }
13769
13770 // If one half is undef, and one is constant, prefer a splat vector rather
13771 // than the normal qNaN. If it's a register, prefer 0.0 since that's
13772 // cheaper to use and may be free with a packed operation.
13773 if (NewElts[0].isUndef()) {
13774 if (isa<ConstantFPSDNode>(NewElts[1]))
13775 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
13776 ? NewElts[1]
13777 : DAG.getConstantFP(0.0f, SL, EltVT);
13778 }
13779
13780 if (NewElts[1].isUndef()) {
13781 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
13782 ? NewElts[0]
13783 : DAG.getConstantFP(0.0f, SL, EltVT);
13784 }
13785
13786 return DAG.getBuildVector(VT, SL, NewElts);
13787 }
13788 }
13789
13790 return SDValue();
13791 }
13792
minMaxOpcToMin3Max3Opc(unsigned Opc)13793 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13794 switch (Opc) {
13795 case ISD::FMAXNUM:
13796 case ISD::FMAXNUM_IEEE:
13797 case ISD::FMAXIMUMNUM:
13798 return AMDGPUISD::FMAX3;
13799 case ISD::FMAXIMUM:
13800 return AMDGPUISD::FMAXIMUM3;
13801 case ISD::SMAX:
13802 return AMDGPUISD::SMAX3;
13803 case ISD::UMAX:
13804 return AMDGPUISD::UMAX3;
13805 case ISD::FMINNUM:
13806 case ISD::FMINNUM_IEEE:
13807 case ISD::FMINIMUMNUM:
13808 return AMDGPUISD::FMIN3;
13809 case ISD::FMINIMUM:
13810 return AMDGPUISD::FMINIMUM3;
13811 case ISD::SMIN:
13812 return AMDGPUISD::SMIN3;
13813 case ISD::UMIN:
13814 return AMDGPUISD::UMIN3;
13815 default:
13816 llvm_unreachable("Not a min/max opcode");
13817 }
13818 }
13819
performIntMed3ImmCombine(SelectionDAG & DAG,const SDLoc & SL,SDValue Src,SDValue MinVal,SDValue MaxVal,bool Signed) const13820 SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13821 const SDLoc &SL, SDValue Src,
13822 SDValue MinVal,
13823 SDValue MaxVal,
13824 bool Signed) const {
13825
13826 // med3 comes from
13827 // min(max(x, K0), K1), K0 < K1
13828 // max(min(x, K0), K1), K1 < K0
13829 //
13830 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13831 // min/max op.
13832 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13833 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13834
13835 if (!MinK || !MaxK)
13836 return SDValue();
13837
13838 if (Signed) {
13839 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13840 return SDValue();
13841 } else {
13842 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13843 return SDValue();
13844 }
13845
13846 EVT VT = MinK->getValueType(0);
13847 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13848 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13849 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13850
13851 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13852 // not available, but this is unlikely to be profitable as constants
13853 // will often need to be materialized & extended, especially on
13854 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13855 return SDValue();
13856 }
13857
getSplatConstantFP(SDValue Op)13858 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13859 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13860 return C;
13861
13862 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13863 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13864 return C;
13865 }
13866
13867 return nullptr;
13868 }
13869
performFPMed3ImmCombine(SelectionDAG & DAG,const SDLoc & SL,SDValue Op0,SDValue Op1) const13870 SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13871 const SDLoc &SL, SDValue Op0,
13872 SDValue Op1) const {
13873 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13874 if (!K1)
13875 return SDValue();
13876
13877 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
13878 if (!K0)
13879 return SDValue();
13880
13881 // Ordered >= (although NaN inputs should have folded away by now).
13882 if (K0->getValueAPF() > K1->getValueAPF())
13883 return SDValue();
13884
13885 // med3 with a nan input acts like
13886 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
13887 //
13888 // So the result depends on whether the IEEE mode bit is enabled or not with a
13889 // signaling nan input.
13890 // ieee=1
13891 // s0 snan: yields s2
13892 // s1 snan: yields s2
13893 // s2 snan: qnan
13894
13895 // s0 qnan: min(s1, s2)
13896 // s1 qnan: min(s0, s2)
13897 // s2 qnan: min(s0, s1)
13898
13899 // ieee=0
13900 // s0 snan: min(s1, s2)
13901 // s1 snan: min(s0, s2)
13902 // s2 snan: qnan
13903
13904 // s0 qnan: min(s1, s2)
13905 // s1 qnan: min(s0, s2)
13906 // s2 qnan: min(s0, s1)
13907 const MachineFunction &MF = DAG.getMachineFunction();
13908 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13909
13910 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
13911 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
13912 // can only form if op0 is fmaxnum_ieee if IEEE=1.
13913 EVT VT = Op0.getValueType();
13914 if (Info->getMode().DX10Clamp) {
13915 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13916 // hardware fmed3 behavior converting to a min.
13917 // FIXME: Should this be allowing -0.0?
13918 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13919 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13920 }
13921
13922 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13923 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13924 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13925 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13926 // then give the other result, which is different from med3 with a NaN
13927 // input.
13928 SDValue Var = Op0.getOperand(0);
13929 if (!DAG.isKnownNeverSNaN(Var))
13930 return SDValue();
13931
13932 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13933
13934 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13935 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13936 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
13937 SDValue(K0, 0), SDValue(K1, 0));
13938 }
13939 }
13940
13941 return SDValue();
13942 }
13943
13944 /// \return true if the subtarget supports minimum3 and maximum3 with the given
13945 /// base min/max opcode \p Opc for type \p VT.
supportsMin3Max3(const GCNSubtarget & Subtarget,unsigned Opc,EVT VT)13946 static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13947 EVT VT) {
13948 switch (Opc) {
13949 case ISD::FMINNUM:
13950 case ISD::FMAXNUM:
13951 case ISD::FMINNUM_IEEE:
13952 case ISD::FMAXNUM_IEEE:
13953 case ISD::FMINIMUMNUM:
13954 case ISD::FMAXIMUMNUM:
13955 case AMDGPUISD::FMIN_LEGACY:
13956 case AMDGPUISD::FMAX_LEGACY:
13957 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13958 case ISD::FMINIMUM:
13959 case ISD::FMAXIMUM:
13960 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
13961 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
13962 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
13963 case ISD::SMAX:
13964 case ISD::SMIN:
13965 case ISD::UMAX:
13966 case ISD::UMIN:
13967 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13968 default:
13969 return false;
13970 }
13971
13972 llvm_unreachable("not a min/max opcode");
13973 }
13974
performMinMaxCombine(SDNode * N,DAGCombinerInfo & DCI) const13975 SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13976 DAGCombinerInfo &DCI) const {
13977 SelectionDAG &DAG = DCI.DAG;
13978
13979 EVT VT = N->getValueType(0);
13980 unsigned Opc = N->getOpcode();
13981 SDValue Op0 = N->getOperand(0);
13982 SDValue Op1 = N->getOperand(1);
13983
13984 // Only do this if the inner op has one use since this will just increases
13985 // register pressure for no benefit.
13986
13987 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
13988 // max(max(a, b), c) -> max3(a, b, c)
13989 // min(min(a, b), c) -> min3(a, b, c)
13990 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13991 SDLoc DL(N);
13992 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13993 Op0.getOperand(0), Op0.getOperand(1), Op1);
13994 }
13995
13996 // Try commuted.
13997 // max(a, max(b, c)) -> max3(a, b, c)
13998 // min(a, min(b, c)) -> min3(a, b, c)
13999 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
14000 SDLoc DL(N);
14001 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14002 Op0, Op1.getOperand(0), Op1.getOperand(1));
14003 }
14004 }
14005
14006 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
14007 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
14008 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
14009 if (SDValue Med3 = performIntMed3ImmCombine(
14010 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
14011 return Med3;
14012 }
14013 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
14014 if (SDValue Med3 = performIntMed3ImmCombine(
14015 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
14016 return Med3;
14017 }
14018
14019 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
14020 if (SDValue Med3 = performIntMed3ImmCombine(
14021 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
14022 return Med3;
14023 }
14024 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
14025 if (SDValue Med3 = performIntMed3ImmCombine(
14026 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
14027 return Med3;
14028 }
14029
14030 // if !is_snan(x):
14031 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14032 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14033 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14034 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14035 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14036 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14037 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14038 (Opc == AMDGPUISD::FMIN_LEGACY &&
14039 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
14040 (VT == MVT::f32 || VT == MVT::f64 ||
14041 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14042 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14043 Op0.hasOneUse()) {
14044 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14045 return Res;
14046 }
14047
14048 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
14049 // for some types, but at a higher cost since it's implemented with a 3
14050 // operand form.
14051 const SDNodeFlags Flags = N->getFlags();
14052 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14053 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
14054 unsigned NewOpc =
14055 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14056 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
14057 }
14058
14059 return SDValue();
14060 }
14061
isClampZeroToOne(SDValue A,SDValue B)14062 static bool isClampZeroToOne(SDValue A, SDValue B) {
14063 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
14064 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
14065 // FIXME: Should this be allowing -0.0?
14066 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
14067 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
14068 }
14069 }
14070
14071 return false;
14072 }
14073
14074 // FIXME: Should only worry about snans for version with chain.
performFMed3Combine(SDNode * N,DAGCombinerInfo & DCI) const14075 SDValue SITargetLowering::performFMed3Combine(SDNode *N,
14076 DAGCombinerInfo &DCI) const {
14077 EVT VT = N->getValueType(0);
14078 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
14079 // NaNs. With a NaN input, the order of the operands may change the result.
14080
14081 SelectionDAG &DAG = DCI.DAG;
14082 SDLoc SL(N);
14083
14084 SDValue Src0 = N->getOperand(0);
14085 SDValue Src1 = N->getOperand(1);
14086 SDValue Src2 = N->getOperand(2);
14087
14088 if (isClampZeroToOne(Src0, Src1)) {
14089 // const_a, const_b, x -> clamp is safe in all cases including signaling
14090 // nans.
14091 // FIXME: Should this be allowing -0.0?
14092 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
14093 }
14094
14095 const MachineFunction &MF = DAG.getMachineFunction();
14096 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14097
14098 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
14099 // handling no dx10-clamp?
14100 if (Info->getMode().DX10Clamp) {
14101 // If NaNs is clamped to 0, we are free to reorder the inputs.
14102
14103 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
14104 std::swap(Src0, Src1);
14105
14106 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
14107 std::swap(Src1, Src2);
14108
14109 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
14110 std::swap(Src0, Src1);
14111
14112 if (isClampZeroToOne(Src1, Src2))
14113 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
14114 }
14115
14116 return SDValue();
14117 }
14118
performCvtPkRTZCombine(SDNode * N,DAGCombinerInfo & DCI) const14119 SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
14120 DAGCombinerInfo &DCI) const {
14121 SDValue Src0 = N->getOperand(0);
14122 SDValue Src1 = N->getOperand(1);
14123 if (Src0.isUndef() && Src1.isUndef())
14124 return DCI.DAG.getUNDEF(N->getValueType(0));
14125 return SDValue();
14126 }
14127
14128 // Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
14129 // expanded into a set of cmp/select instructions.
shouldExpandVectorDynExt(unsigned EltSize,unsigned NumElem,bool IsDivergentIdx,const GCNSubtarget * Subtarget)14130 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
14131 unsigned NumElem,
14132 bool IsDivergentIdx,
14133 const GCNSubtarget *Subtarget) {
14134 if (UseDivergentRegisterIndexing)
14135 return false;
14136
14137 unsigned VecSize = EltSize * NumElem;
14138
14139 // Sub-dword vectors of size 2 dword or less have better implementation.
14140 if (VecSize <= 64 && EltSize < 32)
14141 return false;
14142
14143 // Always expand the rest of sub-dword instructions, otherwise it will be
14144 // lowered via memory.
14145 if (EltSize < 32)
14146 return true;
14147
14148 // Always do this if var-idx is divergent, otherwise it will become a loop.
14149 if (IsDivergentIdx)
14150 return true;
14151
14152 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
14153 unsigned NumInsts = NumElem /* Number of compares */ +
14154 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
14155
14156 // On some architectures (GFX9) movrel is not available and it's better
14157 // to expand.
14158 if (Subtarget->useVGPRIndexMode())
14159 return NumInsts <= 16;
14160
14161 // If movrel is available, use it instead of expanding for vector of 8
14162 // elements.
14163 if (Subtarget->hasMovrel())
14164 return NumInsts <= 15;
14165
14166 return true;
14167 }
14168
shouldExpandVectorDynExt(SDNode * N) const14169 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
14170 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
14171 if (isa<ConstantSDNode>(Idx))
14172 return false;
14173
14174 SDValue Vec = N->getOperand(0);
14175 EVT VecVT = Vec.getValueType();
14176 EVT EltVT = VecVT.getVectorElementType();
14177 unsigned EltSize = EltVT.getSizeInBits();
14178 unsigned NumElem = VecVT.getVectorNumElements();
14179
14180 return SITargetLowering::shouldExpandVectorDynExt(
14181 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
14182 }
14183
14184 SDValue
performExtractVectorEltCombine(SDNode * N,DAGCombinerInfo & DCI) const14185 SITargetLowering::performExtractVectorEltCombine(SDNode *N,
14186 DAGCombinerInfo &DCI) const {
14187 SDValue Vec = N->getOperand(0);
14188 SelectionDAG &DAG = DCI.DAG;
14189
14190 EVT VecVT = Vec.getValueType();
14191 EVT VecEltVT = VecVT.getVectorElementType();
14192 EVT ResVT = N->getValueType(0);
14193
14194 unsigned VecSize = VecVT.getSizeInBits();
14195 unsigned VecEltSize = VecEltVT.getSizeInBits();
14196
14197 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
14198 allUsesHaveSourceMods(N)) {
14199 SDLoc SL(N);
14200 SDValue Idx = N->getOperand(1);
14201 SDValue Elt =
14202 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
14203 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
14204 }
14205
14206 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
14207 // =>
14208 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
14209 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
14210 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
14211 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
14212 SDLoc SL(N);
14213 SDValue Idx = N->getOperand(1);
14214 unsigned Opc = Vec.getOpcode();
14215
14216 switch (Opc) {
14217 default:
14218 break;
14219 // TODO: Support other binary operations.
14220 case ISD::FADD:
14221 case ISD::FSUB:
14222 case ISD::FMUL:
14223 case ISD::ADD:
14224 case ISD::UMIN:
14225 case ISD::UMAX:
14226 case ISD::SMIN:
14227 case ISD::SMAX:
14228 case ISD::FMAXNUM:
14229 case ISD::FMINNUM:
14230 case ISD::FMAXNUM_IEEE:
14231 case ISD::FMINNUM_IEEE:
14232 case ISD::FMAXIMUM:
14233 case ISD::FMINIMUM: {
14234 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
14235 Vec.getOperand(0), Idx);
14236 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
14237 Vec.getOperand(1), Idx);
14238
14239 DCI.AddToWorklist(Elt0.getNode());
14240 DCI.AddToWorklist(Elt1.getNode());
14241 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
14242 }
14243 }
14244 }
14245
14246 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
14247 if (shouldExpandVectorDynExt(N)) {
14248 SDLoc SL(N);
14249 SDValue Idx = N->getOperand(1);
14250 SDValue V;
14251 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14252 SDValue IC = DAG.getVectorIdxConstant(I, SL);
14253 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
14254 if (I == 0)
14255 V = Elt;
14256 else
14257 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
14258 }
14259 return V;
14260 }
14261
14262 if (!DCI.isBeforeLegalize())
14263 return SDValue();
14264
14265 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
14266 // elements. This exposes more load reduction opportunities by replacing
14267 // multiple small extract_vector_elements with a single 32-bit extract.
14268 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
14269 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
14270 VecSize > 32 && VecSize % 32 == 0 && Idx) {
14271 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
14272
14273 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
14274 unsigned EltIdx = BitIndex / 32;
14275 unsigned LeftoverBitIdx = BitIndex % 32;
14276 SDLoc SL(N);
14277
14278 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
14279 DCI.AddToWorklist(Cast.getNode());
14280
14281 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
14282 DAG.getConstant(EltIdx, SL, MVT::i32));
14283 DCI.AddToWorklist(Elt.getNode());
14284 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
14285 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
14286 DCI.AddToWorklist(Srl.getNode());
14287
14288 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
14289 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
14290 DCI.AddToWorklist(Trunc.getNode());
14291
14292 if (VecEltVT == ResVT) {
14293 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
14294 }
14295
14296 assert(ResVT.isScalarInteger());
14297 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
14298 }
14299
14300 return SDValue();
14301 }
14302
14303 SDValue
performInsertVectorEltCombine(SDNode * N,DAGCombinerInfo & DCI) const14304 SITargetLowering::performInsertVectorEltCombine(SDNode *N,
14305 DAGCombinerInfo &DCI) const {
14306 SDValue Vec = N->getOperand(0);
14307 SDValue Idx = N->getOperand(2);
14308 EVT VecVT = Vec.getValueType();
14309 EVT EltVT = VecVT.getVectorElementType();
14310
14311 // INSERT_VECTOR_ELT (<n x e>, var-idx)
14312 // => BUILD_VECTOR n x select (e, const-idx)
14313 if (!shouldExpandVectorDynExt(N))
14314 return SDValue();
14315
14316 SelectionDAG &DAG = DCI.DAG;
14317 SDLoc SL(N);
14318 SDValue Ins = N->getOperand(1);
14319 EVT IdxVT = Idx.getValueType();
14320
14321 SmallVector<SDValue, 16> Ops;
14322 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14323 SDValue IC = DAG.getConstant(I, SL, IdxVT);
14324 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
14325 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
14326 Ops.push_back(V);
14327 }
14328
14329 return DAG.getBuildVector(VecVT, SL, Ops);
14330 }
14331
14332 /// Return the source of an fp_extend from f16 to f32, or a converted FP
14333 /// constant.
strictFPExtFromF16(SelectionDAG & DAG,SDValue Src)14334 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
14335 if (Src.getOpcode() == ISD::FP_EXTEND &&
14336 Src.getOperand(0).getValueType() == MVT::f16) {
14337 return Src.getOperand(0);
14338 }
14339
14340 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
14341 APFloat Val = CFP->getValueAPF();
14342 bool LosesInfo = true;
14343 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
14344 if (!LosesInfo)
14345 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
14346 }
14347
14348 return SDValue();
14349 }
14350
performFPRoundCombine(SDNode * N,DAGCombinerInfo & DCI) const14351 SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
14352 DAGCombinerInfo &DCI) const {
14353 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
14354 "combine only useful on gfx8");
14355
14356 SDValue TruncSrc = N->getOperand(0);
14357 EVT VT = N->getValueType(0);
14358 if (VT != MVT::f16)
14359 return SDValue();
14360
14361 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
14362 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
14363 return SDValue();
14364
14365 SelectionDAG &DAG = DCI.DAG;
14366 SDLoc SL(N);
14367
14368 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
14369 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
14370 // casting back.
14371
14372 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
14373 // fmin(fmax(a, b), fmax(fmin(a, b), c))
14374 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
14375 if (!A)
14376 return SDValue();
14377
14378 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
14379 if (!B)
14380 return SDValue();
14381
14382 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
14383 if (!C)
14384 return SDValue();
14385
14386 // This changes signaling nan behavior. If an input is a signaling nan, it
14387 // would have been quieted by the fpext originally. We don't care because
14388 // these are unconstrained ops. If we needed to insert quieting canonicalizes
14389 // we would be worse off than just doing the promotion.
14390 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
14391 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
14392 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
14393 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
14394 }
14395
getFusedOpcode(const SelectionDAG & DAG,const SDNode * N0,const SDNode * N1) const14396 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
14397 const SDNode *N0,
14398 const SDNode *N1) const {
14399 EVT VT = N0->getValueType(0);
14400
14401 // Only do this if we are not trying to support denormals. v_mad_f32 does not
14402 // support denormals ever.
14403 if (((VT == MVT::f32 &&
14404 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
14405 (VT == MVT::f16 && Subtarget->hasMadF16() &&
14406 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
14407 isOperationLegal(ISD::FMAD, VT))
14408 return ISD::FMAD;
14409
14410 const TargetOptions &Options = DAG.getTarget().Options;
14411 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14412 (N0->getFlags().hasAllowContract() &&
14413 N1->getFlags().hasAllowContract())) &&
14414 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
14415 return ISD::FMA;
14416 }
14417
14418 return 0;
14419 }
14420
14421 // For a reassociatable opcode perform:
14422 // op x, (op y, z) -> op (op x, z), y, if x and z are uniform
reassociateScalarOps(SDNode * N,SelectionDAG & DAG) const14423 SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
14424 SelectionDAG &DAG) const {
14425 EVT VT = N->getValueType(0);
14426 if (VT != MVT::i32 && VT != MVT::i64)
14427 return SDValue();
14428
14429 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
14430 return SDValue();
14431
14432 unsigned Opc = N->getOpcode();
14433 SDValue Op0 = N->getOperand(0);
14434 SDValue Op1 = N->getOperand(1);
14435
14436 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
14437 return SDValue();
14438
14439 if (Op0->isDivergent())
14440 std::swap(Op0, Op1);
14441
14442 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
14443 return SDValue();
14444
14445 SDValue Op2 = Op1.getOperand(1);
14446 Op1 = Op1.getOperand(0);
14447 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
14448 return SDValue();
14449
14450 if (Op1->isDivergent())
14451 std::swap(Op1, Op2);
14452
14453 SDLoc SL(N);
14454 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
14455 return DAG.getNode(Opc, SL, VT, Add1, Op2);
14456 }
14457
getMad64_32(SelectionDAG & DAG,const SDLoc & SL,EVT VT,SDValue N0,SDValue N1,SDValue N2,bool Signed)14458 static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
14459 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
14460 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
14461 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
14462 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
14463 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
14464 }
14465
14466 // Fold
14467 // y = lshr i64 x, 32
14468 // res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
14469 // with Const.hi == -1
14470 // To
14471 // res = mad_u64_u32 y.lo ,Const.lo, x.lo
tryFoldMADwithSRL(SelectionDAG & DAG,const SDLoc & SL,SDValue MulLHS,SDValue MulRHS,SDValue AddRHS)14472 static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
14473 SDValue MulLHS, SDValue MulRHS,
14474 SDValue AddRHS) {
14475 if (MulRHS.getOpcode() == ISD::SRL)
14476 std::swap(MulLHS, MulRHS);
14477
14478 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
14479 return SDValue();
14480
14481 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
14482 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
14483 MulLHS.getOperand(0) != AddRHS)
14484 return SDValue();
14485
14486 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS.getNode());
14487 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
14488 return SDValue();
14489
14490 SDValue ConstMul =
14491 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
14492 return getMad64_32(DAG, SL, MVT::i64,
14493 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
14494 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
14495 }
14496
14497 // Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
14498 // multiplies, if any.
14499 //
14500 // Full 64-bit multiplies that feed into an addition are lowered here instead
14501 // of using the generic expansion. The generic expansion ends up with
14502 // a tree of ADD nodes that prevents us from using the "add" part of the
14503 // MAD instruction. The expansion produced here results in a chain of ADDs
14504 // instead of a tree.
tryFoldToMad64_32(SDNode * N,DAGCombinerInfo & DCI) const14505 SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
14506 DAGCombinerInfo &DCI) const {
14507 assert(N->getOpcode() == ISD::ADD);
14508
14509 SelectionDAG &DAG = DCI.DAG;
14510 EVT VT = N->getValueType(0);
14511 SDLoc SL(N);
14512 SDValue LHS = N->getOperand(0);
14513 SDValue RHS = N->getOperand(1);
14514
14515 if (VT.isVector())
14516 return SDValue();
14517
14518 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
14519 // result in scalar registers for uniform values.
14520 if (!N->isDivergent() && Subtarget->hasSMulHi())
14521 return SDValue();
14522
14523 unsigned NumBits = VT.getScalarSizeInBits();
14524 if (NumBits <= 32 || NumBits > 64)
14525 return SDValue();
14526
14527 if (LHS.getOpcode() != ISD::MUL) {
14528 assert(RHS.getOpcode() == ISD::MUL);
14529 std::swap(LHS, RHS);
14530 }
14531
14532 // Avoid the fold if it would unduly increase the number of multiplies due to
14533 // multiple uses, except on hardware with full-rate multiply-add (which is
14534 // part of full-rate 64-bit ops).
14535 if (!Subtarget->hasFullRate64Ops()) {
14536 unsigned NumUsers = 0;
14537 for (SDNode *User : LHS->users()) {
14538 // There is a use that does not feed into addition, so the multiply can't
14539 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
14540 if (User->getOpcode() != ISD::ADD)
14541 return SDValue();
14542
14543 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
14544 // MUL + 3xADD + 3xADDC over 3xMAD.
14545 ++NumUsers;
14546 if (NumUsers >= 3)
14547 return SDValue();
14548 }
14549 }
14550
14551 SDValue MulLHS = LHS.getOperand(0);
14552 SDValue MulRHS = LHS.getOperand(1);
14553 SDValue AddRHS = RHS;
14554
14555 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
14556 return FoldedMAD;
14557
14558 // Always check whether operands are small unsigned values, since that
14559 // knowledge is useful in more cases. Check for small signed values only if
14560 // doing so can unlock a shorter code sequence.
14561 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
14562 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
14563
14564 bool MulSignedLo = false;
14565 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
14566 MulSignedLo =
14567 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
14568 }
14569
14570 // The operands and final result all have the same number of bits. If
14571 // operands need to be extended, they can be extended with garbage. The
14572 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
14573 // truncated away in the end.
14574 if (VT != MVT::i64) {
14575 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
14576 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
14577 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
14578 }
14579
14580 // The basic code generated is conceptually straightforward. Pseudo code:
14581 //
14582 // accum = mad_64_32 lhs.lo, rhs.lo, accum
14583 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
14584 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
14585 //
14586 // The second and third lines are optional, depending on whether the factors
14587 // are {sign,zero}-extended or not.
14588 //
14589 // The actual DAG is noisier than the pseudo code, but only due to
14590 // instructions that disassemble values into low and high parts, and
14591 // assemble the final result.
14592 SDValue One = DAG.getConstant(1, SL, MVT::i32);
14593
14594 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
14595 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
14596 SDValue Accum =
14597 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
14598
14599 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
14600 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
14601
14602 if (!MulLHSUnsigned32) {
14603 auto MulLHSHi =
14604 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
14605 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
14606 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14607 }
14608
14609 if (!MulRHSUnsigned32) {
14610 auto MulRHSHi =
14611 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
14612 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
14613 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14614 }
14615
14616 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
14617 Accum = DAG.getBitcast(MVT::i64, Accum);
14618 }
14619
14620 if (VT != MVT::i64)
14621 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
14622 return Accum;
14623 }
14624
14625 SDValue
foldAddSub64WithZeroLowBitsTo32(SDNode * N,DAGCombinerInfo & DCI) const14626 SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
14627 DAGCombinerInfo &DCI) const {
14628 SDValue RHS = N->getOperand(1);
14629 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14630 if (!CRHS)
14631 return SDValue();
14632
14633 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
14634 // common.
14635 uint64_t Val = CRHS->getZExtValue();
14636 if (countr_zero(Val) >= 32) {
14637 SelectionDAG &DAG = DCI.DAG;
14638 SDLoc SL(N);
14639 SDValue LHS = N->getOperand(0);
14640
14641 // Avoid carry machinery if we know the low half of the add does not
14642 // contribute to the final result.
14643 //
14644 // add i64:x, K if computeTrailingZeros(K) >= 32
14645 // => build_pair (add x.hi, K.hi), x.lo
14646
14647 // Breaking the 64-bit add here with this strange constant is unlikely
14648 // to interfere with addressing mode patterns.
14649
14650 SDValue Hi = getHiHalf64(LHS, DAG);
14651 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
14652 SDValue AddHi =
14653 DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
14654
14655 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
14656 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
14657 }
14658
14659 return SDValue();
14660 }
14661
14662 // Collect the ultimate src of each of the mul node's operands, and confirm
14663 // each operand is 8 bytes.
14664 static std::optional<ByteProvider<SDValue>>
handleMulOperand(const SDValue & MulOperand)14665 handleMulOperand(const SDValue &MulOperand) {
14666 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
14667 if (!Byte0 || Byte0->isConstantZero()) {
14668 return std::nullopt;
14669 }
14670 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
14671 if (Byte1 && !Byte1->isConstantZero()) {
14672 return std::nullopt;
14673 }
14674 return Byte0;
14675 }
14676
// Merge two 4-byte permute masks. In every byte lane at least one of the two
// masks must have some of the 0x0c bits set (asserted). The 0x0c bits of a
// lane survive only where both masks have them; all remaining bits of the two
// masks are OR-ed together.
static unsigned addPermMasks(unsigned First, unsigned Second) {
  constexpr unsigned ZeroSelBits = 0x0c0c0c0c;
  const unsigned FirstZeroBits = First & ZeroSelBits;
  const unsigned SecondZeroBits = Second & ZeroSelBits;

  for (unsigned Lane = 0; Lane < 4; ++Lane) {
    const unsigned LaneBits = 0xFFu << (8 * Lane);
    assert((FirstZeroBits & LaneBits) | (SecondZeroBits & LaneBits));
    (void)LaneBits;
  }

  const unsigned SelectorBits = (First | Second) & ~ZeroSelBits;
  return SelectorBits | (FirstZeroBits & SecondZeroBits);
}
14690
// One source entry for a dot4 operand, filled in by placeSources and consumed
// by resolveSources.
struct DotSrc {
  // Value that provides the byte(s).
  SDValue SrcOp;
  // Per-byte selector mask; resolveSources passes it as the selector constant
  // of an AMDGPUISD::PERM node.
  int64_t PermMask;
  // Which 32-bit dword of SrcOp is used (byte offset / 4); passed to
  // getDWordFromOffset.
  int64_t DWordOffset;
};
14696
placeSources(ByteProvider<SDValue> & Src0,ByteProvider<SDValue> & Src1,SmallVectorImpl<DotSrc> & Src0s,SmallVectorImpl<DotSrc> & Src1s,int Step)14697 static void placeSources(ByteProvider<SDValue> &Src0,
14698 ByteProvider<SDValue> &Src1,
14699 SmallVectorImpl<DotSrc> &Src0s,
14700 SmallVectorImpl<DotSrc> &Src1s, int Step) {
14701
14702 assert(Src0.Src.has_value() && Src1.Src.has_value());
14703 // Src0s and Src1s are empty, just place arbitrarily.
14704 if (Step == 0) {
14705 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
14706 Src0.SrcOffset / 4});
14707 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
14708 Src1.SrcOffset / 4});
14709 return;
14710 }
14711
14712 for (int BPI = 0; BPI < 2; BPI++) {
14713 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
14714 if (BPI == 1) {
14715 BPP = {Src1, Src0};
14716 }
14717 unsigned ZeroMask = 0x0c0c0c0c;
14718 unsigned FMask = 0xFF << (8 * (3 - Step));
14719
14720 unsigned FirstMask =
14721 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14722 unsigned SecondMask =
14723 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14724 // Attempt to find Src vector which contains our SDValue, if so, add our
14725 // perm mask to the existing one. If we are unable to find a match for the
14726 // first SDValue, attempt to find match for the second.
14727 int FirstGroup = -1;
14728 for (int I = 0; I < 2; I++) {
14729 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
14730 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
14731 return IterElt.SrcOp == *BPP.first.Src &&
14732 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14733 };
14734
14735 auto *Match = llvm::find_if(Srcs, MatchesFirst);
14736 if (Match != Srcs.end()) {
14737 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
14738 FirstGroup = I;
14739 break;
14740 }
14741 }
14742 if (FirstGroup != -1) {
14743 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
14744 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
14745 return IterElt.SrcOp == *BPP.second.Src &&
14746 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14747 };
14748 auto *Match = llvm::find_if(Srcs, MatchesSecond);
14749 if (Match != Srcs.end()) {
14750 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
14751 } else
14752 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
14753 return;
14754 }
14755 }
14756
14757 // If we have made it here, then we could not find a match in Src0s or Src1s
14758 // for either Src0 or Src1, so just place them arbitrarily.
14759
14760 unsigned ZeroMask = 0x0c0c0c0c;
14761 unsigned FMask = 0xFF << (8 * (3 - Step));
14762
14763 Src0s.push_back(
14764 {*Src0.Src,
14765 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14766 Src0.SrcOffset / 4});
14767 Src1s.push_back(
14768 {*Src1.Src,
14769 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14770 Src1.SrcOffset / 4});
14771 }
14772
resolveSources(SelectionDAG & DAG,SDLoc SL,SmallVectorImpl<DotSrc> & Srcs,bool IsSigned,bool IsAny)14773 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
14774 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
14775 bool IsAny) {
14776
14777 // If we just have one source, just permute it accordingly.
14778 if (Srcs.size() == 1) {
14779 auto *Elt = Srcs.begin();
14780 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
14781
14782 // v_perm will produce the original value
14783 if (Elt->PermMask == 0x3020100)
14784 return EltOp;
14785
14786 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14787 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
14788 }
14789
14790 auto *FirstElt = Srcs.begin();
14791 auto *SecondElt = std::next(FirstElt);
14792
14793 SmallVector<SDValue, 2> Perms;
14794
14795 // If we have multiple sources in the chain, combine them via perms (using
14796 // calculated perm mask) and Ors.
14797 while (true) {
14798 auto FirstMask = FirstElt->PermMask;
14799 auto SecondMask = SecondElt->PermMask;
14800
14801 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14802 unsigned FirstPlusFour = FirstMask | 0x04040404;
14803 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produced 0x00 for any
14804 // original 0x0C.
14805 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14806
14807 auto PermMask = addPermMasks(FirstMask, SecondMask);
14808 auto FirstVal =
14809 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14810 auto SecondVal =
14811 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
14812
14813 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
14814 SecondVal,
14815 DAG.getConstant(PermMask, SL, MVT::i32)));
14816
14817 FirstElt = std::next(SecondElt);
14818 if (FirstElt == Srcs.end())
14819 break;
14820
14821 SecondElt = std::next(FirstElt);
14822 // If we only have a FirstElt, then just combine that into the cumulative
14823 // source node.
14824 if (SecondElt == Srcs.end()) {
14825 auto EltOp =
14826 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14827
14828 Perms.push_back(
14829 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14830 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
14831 break;
14832 }
14833 }
14834
14835 assert(Perms.size() == 1 || Perms.size() == 2);
14836 return Perms.size() == 2
14837 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
14838 : Perms[0];
14839 }
14840
fixMasks(SmallVectorImpl<DotSrc> & Srcs,unsigned ChainLength)14841 static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
14842 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14843 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14844 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14845 EntryMask += ZeroMask;
14846 }
14847 }
14848
isMul(const SDValue Op)14849 static bool isMul(const SDValue Op) {
14850 auto Opcode = Op.getOpcode();
14851
14852 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
14853 Opcode == AMDGPUISD::MUL_I24);
14854 }
14855
14856 static std::optional<bool>
checkDot4MulSignedness(const SDValue & N,ByteProvider<SDValue> & Src0,ByteProvider<SDValue> & Src1,const SDValue & S0Op,const SDValue & S1Op,const SelectionDAG & DAG)14857 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
14858 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
14859 const SDValue &S1Op, const SelectionDAG &DAG) {
14860 // If we both ops are i8s (pre legalize-dag), then the signedness semantics
14861 // of the dot4 is irrelevant.
14862 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
14863 return false;
14864
14865 auto Known0 = DAG.computeKnownBits(S0Op, 0);
14866 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
14867 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14868 auto Known1 = DAG.computeKnownBits(S1Op, 0);
14869 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
14870 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14871
14872 assert(!(S0IsUnsigned && S0IsSigned));
14873 assert(!(S1IsUnsigned && S1IsSigned));
14874
14875 // There are 9 possible permutations of
14876 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
14877
14878 // In two permutations, the sign bits are known to be the same for both Ops,
14879 // so simply return Signed / Unsigned corresponding to the MSB
14880
14881 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14882 return S0IsSigned;
14883
14884 // In another two permutations, the sign bits are known to be opposite. In
14885 // this case return std::nullopt to indicate a bad match.
14886
14887 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14888 return std::nullopt;
14889
14890 // In the remaining five permutations, we don't know the value of the sign
14891 // bit for at least one Op. Since we have a valid ByteProvider, we know that
14892 // the upper bits must be extension bits. Thus, the only ways for the sign
14893 // bit to be unknown is if it was sign extended from unknown value, or if it
14894 // was any extended. In either case, it is correct to use the signed
14895 // version of the signedness semantics of dot4
14896
14897 // In two of such permutations, we known the sign bit is set for
14898 // one op, and the other is unknown. It is okay to used signed version of
14899 // dot4.
14900 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14901 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14902 return true;
14903
14904 // In one such permutation, we don't know either of the sign bits. It is okay
14905 // to used the signed version of dot4.
14906 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14907 return true;
14908
14909 // In two of such permutations, we known the sign bit is unset for
14910 // one op, and the other is unknown. Return std::nullopt to indicate a
14911 // bad match.
14912 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14913 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14914 return std::nullopt;
14915
14916 llvm_unreachable("Fully covered condition");
14917 }
14918
performAddCombine(SDNode * N,DAGCombinerInfo & DCI) const14919 SDValue SITargetLowering::performAddCombine(SDNode *N,
14920 DAGCombinerInfo &DCI) const {
14921 SelectionDAG &DAG = DCI.DAG;
14922 EVT VT = N->getValueType(0);
14923 SDLoc SL(N);
14924 SDValue LHS = N->getOperand(0);
14925 SDValue RHS = N->getOperand(1);
14926
14927 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14928 if (Subtarget->hasMad64_32()) {
14929 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14930 return Folded;
14931 }
14932 }
14933
14934 if (SDValue V = reassociateScalarOps(N, DAG)) {
14935 return V;
14936 }
14937
14938 if (VT == MVT::i64) {
14939 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14940 return Folded;
14941 }
14942
14943 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14944 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14945 SDValue TempNode(N, 0);
14946 std::optional<bool> IsSigned;
14947 SmallVector<DotSrc, 4> Src0s;
14948 SmallVector<DotSrc, 4> Src1s;
14949 SmallVector<SDValue, 4> Src2s;
14950
14951 // Match the v_dot4 tree, while collecting src nodes.
14952 int ChainLength = 0;
14953 for (int I = 0; I < 4; I++) {
14954 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14955 if (MulIdx == -1)
14956 break;
14957 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14958 if (!Src0)
14959 break;
14960 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14961 if (!Src1)
14962 break;
14963
14964 auto IterIsSigned = checkDot4MulSignedness(
14965 TempNode->getOperand(MulIdx), *Src0, *Src1,
14966 TempNode->getOperand(MulIdx)->getOperand(0),
14967 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14968 if (!IterIsSigned)
14969 break;
14970 if (!IsSigned)
14971 IsSigned = *IterIsSigned;
14972 if (*IterIsSigned != *IsSigned)
14973 break;
14974 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14975 auto AddIdx = 1 - MulIdx;
14976 // Allow the special case where add (add (mul24, 0), mul24) became ->
14977 // add (mul24, mul24).
14978 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14979 Src2s.push_back(TempNode->getOperand(AddIdx));
14980 auto Src0 =
14981 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14982 if (!Src0)
14983 break;
14984 auto Src1 =
14985 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14986 if (!Src1)
14987 break;
14988 auto IterIsSigned = checkDot4MulSignedness(
14989 TempNode->getOperand(AddIdx), *Src0, *Src1,
14990 TempNode->getOperand(AddIdx)->getOperand(0),
14991 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14992 if (!IterIsSigned)
14993 break;
14994 assert(IsSigned);
14995 if (*IterIsSigned != *IsSigned)
14996 break;
14997 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14998 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14999 ChainLength = I + 2;
15000 break;
15001 }
15002
15003 TempNode = TempNode->getOperand(AddIdx);
15004 Src2s.push_back(TempNode);
15005 ChainLength = I + 1;
15006 if (TempNode->getNumOperands() < 2)
15007 break;
15008 LHS = TempNode->getOperand(0);
15009 RHS = TempNode->getOperand(1);
15010 }
15011
15012 if (ChainLength < 2)
15013 return SDValue();
15014
15015 // Masks were constructed with assumption that we would find a chain of
15016 // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
15017 // 0x0c) so they do not affect dot calculation.
15018 if (ChainLength < 4) {
15019 fixMasks(Src0s, ChainLength);
15020 fixMasks(Src1s, ChainLength);
15021 }
15022
15023 SDValue Src0, Src1;
15024
15025 // If we are just using a single source for both, and have permuted the
15026 // bytes consistently, we can just use the sources without permuting
15027 // (commutation).
15028 bool UseOriginalSrc = false;
15029 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
15030 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
15031 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
15032 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
15033 SmallVector<unsigned, 4> SrcBytes;
15034 auto Src0Mask = Src0s.begin()->PermMask;
15035 SrcBytes.push_back(Src0Mask & 0xFF000000);
15036 bool UniqueEntries = true;
15037 for (auto I = 1; I < 4; I++) {
15038 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
15039
15040 if (is_contained(SrcBytes, NextByte)) {
15041 UniqueEntries = false;
15042 break;
15043 }
15044 SrcBytes.push_back(NextByte);
15045 }
15046
15047 if (UniqueEntries) {
15048 UseOriginalSrc = true;
15049
15050 auto *FirstElt = Src0s.begin();
15051 auto FirstEltOp =
15052 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15053
15054 auto *SecondElt = Src1s.begin();
15055 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
15056 SecondElt->DWordOffset);
15057
15058 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
15059 MVT::getIntegerVT(32));
15060 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
15061 MVT::getIntegerVT(32));
15062 }
15063 }
15064
15065 if (!UseOriginalSrc) {
15066 Src0 = resolveSources(DAG, SL, Src0s, false, true);
15067 Src1 = resolveSources(DAG, SL, Src1s, false, true);
15068 }
15069
15070 assert(IsSigned);
15071 SDValue Src2 =
15072 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
15073
15074 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
15075 : Intrinsic::amdgcn_udot4,
15076 SL, MVT::i64);
15077
15078 assert(!VT.isVector());
15079 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
15080 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
15081
15082 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
15083 }
15084
15085 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
15086 return SDValue();
15087
15088 // add x, zext (setcc) => uaddo_carry x, 0, setcc
15089 // add x, sext (setcc) => usubo_carry x, 0, setcc
15090 unsigned Opc = LHS.getOpcode();
15091 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
15092 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
15093 std::swap(RHS, LHS);
15094
15095 Opc = RHS.getOpcode();
15096 switch (Opc) {
15097 default:
15098 break;
15099 case ISD::ZERO_EXTEND:
15100 case ISD::SIGN_EXTEND:
15101 case ISD::ANY_EXTEND: {
15102 auto Cond = RHS.getOperand(0);
15103 // If this won't be a real VOPC output, we would still need to insert an
15104 // extra instruction anyway.
15105 if (!isBoolSGPR(Cond))
15106 break;
15107 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
15108 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
15109 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
15110 return DAG.getNode(Opc, SL, VTList, Args);
15111 }
15112 case ISD::UADDO_CARRY: {
15113 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
15114 if (!isNullConstant(RHS.getOperand(1)))
15115 break;
15116 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
15117 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
15118 }
15119 }
15120 return SDValue();
15121 }
15122
performPtrAddCombine(SDNode * N,DAGCombinerInfo & DCI) const15123 SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
15124 DAGCombinerInfo &DCI) const {
15125 SelectionDAG &DAG = DCI.DAG;
15126 SDLoc DL(N);
15127 SDValue N0 = N->getOperand(0);
15128 SDValue N1 = N->getOperand(1);
15129
15130 if (N1.getOpcode() == ISD::ADD) {
15131 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
15132 // y is not, and (add y, z) is used only once.
15133 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
15134 // z is not, and (add y, z) is used only once.
15135 // The goal is to move constant offsets to the outermost ptradd, to create
15136 // more opportunities to fold offsets into memory instructions.
15137 // Together with the generic combines in DAGCombiner.cpp, this also
15138 // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
15139 //
15140 // This transform is here instead of in the general DAGCombiner as it can
15141 // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
15142 // AArch64's CPA.
15143 SDValue X = N0;
15144 SDValue Y = N1.getOperand(0);
15145 SDValue Z = N1.getOperand(1);
15146 if (N1.hasOneUse()) {
15147 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
15148 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
15149 if (ZIsConstant != YIsConstant) {
15150 // If both additions in the original were NUW, the new ones are as well.
15151 SDNodeFlags Flags =
15152 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
15153 if (YIsConstant)
15154 std::swap(Y, Z);
15155
15156 SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags);
15157 DCI.AddToWorklist(Inner.getNode());
15158 return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags);
15159 }
15160 }
15161 }
15162
15163 return SDValue();
15164 }
15165
performSubCombine(SDNode * N,DAGCombinerInfo & DCI) const15166 SDValue SITargetLowering::performSubCombine(SDNode *N,
15167 DAGCombinerInfo &DCI) const {
15168 SelectionDAG &DAG = DCI.DAG;
15169 EVT VT = N->getValueType(0);
15170
15171 if (VT == MVT::i64) {
15172 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15173 return Folded;
15174 }
15175
15176 if (VT != MVT::i32)
15177 return SDValue();
15178
15179 SDLoc SL(N);
15180 SDValue LHS = N->getOperand(0);
15181 SDValue RHS = N->getOperand(1);
15182
15183 // sub x, zext (setcc) => usubo_carry x, 0, setcc
15184 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
15185 unsigned Opc = RHS.getOpcode();
15186 switch (Opc) {
15187 default:
15188 break;
15189 case ISD::ZERO_EXTEND:
15190 case ISD::SIGN_EXTEND:
15191 case ISD::ANY_EXTEND: {
15192 auto Cond = RHS.getOperand(0);
15193 // If this won't be a real VOPC output, we would still need to insert an
15194 // extra instruction anyway.
15195 if (!isBoolSGPR(Cond))
15196 break;
15197 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
15198 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
15199 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
15200 return DAG.getNode(Opc, SL, VTList, Args);
15201 }
15202 }
15203
15204 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
15205 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
15206 if (!isNullConstant(LHS.getOperand(1)))
15207 return SDValue();
15208 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
15209 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
15210 }
15211 return SDValue();
15212 }
15213
15214 SDValue
performAddCarrySubCarryCombine(SDNode * N,DAGCombinerInfo & DCI) const15215 SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
15216 DAGCombinerInfo &DCI) const {
15217
15218 if (N->getValueType(0) != MVT::i32)
15219 return SDValue();
15220
15221 if (!isNullConstant(N->getOperand(1)))
15222 return SDValue();
15223
15224 SelectionDAG &DAG = DCI.DAG;
15225 SDValue LHS = N->getOperand(0);
15226
15227 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
15228 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
15229 unsigned LHSOpc = LHS.getOpcode();
15230 unsigned Opc = N->getOpcode();
15231 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
15232 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
15233 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
15234 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
15235 }
15236 return SDValue();
15237 }
15238
performFAddCombine(SDNode * N,DAGCombinerInfo & DCI) const15239 SDValue SITargetLowering::performFAddCombine(SDNode *N,
15240 DAGCombinerInfo &DCI) const {
15241 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
15242 return SDValue();
15243
15244 SelectionDAG &DAG = DCI.DAG;
15245 EVT VT = N->getValueType(0);
15246
15247 SDLoc SL(N);
15248 SDValue LHS = N->getOperand(0);
15249 SDValue RHS = N->getOperand(1);
15250
15251 // These should really be instruction patterns, but writing patterns with
15252 // source modifiers is a pain.
15253
15254 // fadd (fadd (a, a), b) -> mad 2.0, a, b
15255 if (LHS.getOpcode() == ISD::FADD) {
15256 SDValue A = LHS.getOperand(0);
15257 if (A == LHS.getOperand(1)) {
15258 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
15259 if (FusedOp != 0) {
15260 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
15261 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
15262 }
15263 }
15264 }
15265
15266 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
15267 if (RHS.getOpcode() == ISD::FADD) {
15268 SDValue A = RHS.getOperand(0);
15269 if (A == RHS.getOperand(1)) {
15270 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
15271 if (FusedOp != 0) {
15272 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
15273 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
15274 }
15275 }
15276 }
15277
15278 return SDValue();
15279 }
15280
performFSubCombine(SDNode * N,DAGCombinerInfo & DCI) const15281 SDValue SITargetLowering::performFSubCombine(SDNode *N,
15282 DAGCombinerInfo &DCI) const {
15283 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
15284 return SDValue();
15285
15286 SelectionDAG &DAG = DCI.DAG;
15287 SDLoc SL(N);
15288 EVT VT = N->getValueType(0);
15289 assert(!VT.isVector());
15290
15291 // Try to get the fneg to fold into the source modifier. This undoes generic
15292 // DAG combines and folds them into the mad.
15293 //
15294 // Only do this if we are not trying to support denormals. v_mad_f32 does
15295 // not support denormals ever.
15296 SDValue LHS = N->getOperand(0);
15297 SDValue RHS = N->getOperand(1);
15298 if (LHS.getOpcode() == ISD::FADD) {
15299 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
15300 SDValue A = LHS.getOperand(0);
15301 if (A == LHS.getOperand(1)) {
15302 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
15303 if (FusedOp != 0) {
15304 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
15305 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
15306
15307 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
15308 }
15309 }
15310 }
15311
15312 if (RHS.getOpcode() == ISD::FADD) {
15313 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
15314
15315 SDValue A = RHS.getOperand(0);
15316 if (A == RHS.getOperand(1)) {
15317 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
15318 if (FusedOp != 0) {
15319 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
15320 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
15321 }
15322 }
15323 }
15324
15325 return SDValue();
15326 }
15327
performFDivCombine(SDNode * N,DAGCombinerInfo & DCI) const15328 SDValue SITargetLowering::performFDivCombine(SDNode *N,
15329 DAGCombinerInfo &DCI) const {
15330 SelectionDAG &DAG = DCI.DAG;
15331 SDLoc SL(N);
15332 EVT VT = N->getValueType(0);
15333 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
15334 return SDValue();
15335
15336 SDValue LHS = N->getOperand(0);
15337 SDValue RHS = N->getOperand(1);
15338
15339 SDNodeFlags Flags = N->getFlags();
15340 SDNodeFlags RHSFlags = RHS->getFlags();
15341 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
15342 !RHS->hasOneUse())
15343 return SDValue();
15344
15345 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
15346 bool IsNegative = false;
15347 if (CLHS->isExactlyValue(1.0) ||
15348 (IsNegative = CLHS->isExactlyValue(-1.0))) {
15349 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
15350 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
15351 if (RHS.getOpcode() == ISD::FSQRT) {
15352 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
15353 SDValue Rsq =
15354 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
15355 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
15356 }
15357 }
15358 }
15359
15360 return SDValue();
15361 }
15362
performFMulCombine(SDNode * N,DAGCombinerInfo & DCI) const15363 SDValue SITargetLowering::performFMulCombine(SDNode *N,
15364 DAGCombinerInfo &DCI) const {
15365 SelectionDAG &DAG = DCI.DAG;
15366 EVT VT = N->getValueType(0);
15367 EVT ScalarVT = VT.getScalarType();
15368 EVT IntVT = VT.changeElementType(MVT::i32);
15369
15370 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
15371 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
15372 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
15373 return SDValue();
15374 }
15375
15376 SDValue LHS = N->getOperand(0);
15377 SDValue RHS = N->getOperand(1);
15378
15379 // It is cheaper to realize i32 inline constants as compared against
15380 // materializing f16 or f64 (or even non-inline f32) values,
15381 // possible via ldexp usage, as shown below :
15382 //
15383 // Given : A = 2^a & B = 2^b ; where a and b are integers.
15384 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
15385 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
15386 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
15387 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
15388 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
15389 if (!TrueNode)
15390 return SDValue();
15391 const ConstantFPSDNode *FalseNode =
15392 isConstOrConstSplatFP(RHS.getOperand(2));
15393 if (!FalseNode)
15394 return SDValue();
15395
15396 if (TrueNode->isNegative() != FalseNode->isNegative())
15397 return SDValue();
15398
15399 // For f32, only non-inline constants should be transformed.
15400 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15401 if (ScalarVT == MVT::f32 &&
15402 TII->isInlineConstant(TrueNode->getValueAPF()) &&
15403 TII->isInlineConstant(FalseNode->getValueAPF()))
15404 return SDValue();
15405
15406 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
15407 if (TrueNodeExpVal == INT_MIN)
15408 return SDValue();
15409 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
15410 if (FalseNodeExpVal == INT_MIN)
15411 return SDValue();
15412
15413 SDLoc SL(N);
15414 SDValue SelectNode =
15415 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
15416 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
15417 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
15418
15419 LHS = TrueNode->isNegative()
15420 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
15421 : LHS;
15422
15423 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
15424 }
15425
15426 return SDValue();
15427 }
15428
performFMACombine(SDNode * N,DAGCombinerInfo & DCI) const15429 SDValue SITargetLowering::performFMACombine(SDNode *N,
15430 DAGCombinerInfo &DCI) const {
15431 SelectionDAG &DAG = DCI.DAG;
15432 EVT VT = N->getValueType(0);
15433 SDLoc SL(N);
15434
15435 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
15436 return SDValue();
15437
15438 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
15439 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
15440 SDValue Op1 = N->getOperand(0);
15441 SDValue Op2 = N->getOperand(1);
15442 SDValue FMA = N->getOperand(2);
15443
15444 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
15445 Op2.getOpcode() != ISD::FP_EXTEND)
15446 return SDValue();
15447
15448 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
15449 // regardless of the denorm mode setting. Therefore,
15450 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
15451 const TargetOptions &Options = DAG.getTarget().Options;
15452 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
15453 (N->getFlags().hasAllowContract() &&
15454 FMA->getFlags().hasAllowContract())) {
15455 Op1 = Op1.getOperand(0);
15456 Op2 = Op2.getOperand(0);
15457 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15458 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15459 return SDValue();
15460
15461 SDValue Vec1 = Op1.getOperand(0);
15462 SDValue Idx1 = Op1.getOperand(1);
15463 SDValue Vec2 = Op2.getOperand(0);
15464
15465 SDValue FMAOp1 = FMA.getOperand(0);
15466 SDValue FMAOp2 = FMA.getOperand(1);
15467 SDValue FMAAcc = FMA.getOperand(2);
15468
15469 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
15470 FMAOp2.getOpcode() != ISD::FP_EXTEND)
15471 return SDValue();
15472
15473 FMAOp1 = FMAOp1.getOperand(0);
15474 FMAOp2 = FMAOp2.getOperand(0);
15475 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15476 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15477 return SDValue();
15478
15479 SDValue Vec3 = FMAOp1.getOperand(0);
15480 SDValue Vec4 = FMAOp2.getOperand(0);
15481 SDValue Idx2 = FMAOp1.getOperand(1);
15482
15483 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
15484 // Idx1 and Idx2 cannot be the same.
15485 Idx1 == Idx2)
15486 return SDValue();
15487
15488 if (Vec1 == Vec2 || Vec3 == Vec4)
15489 return SDValue();
15490
15491 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
15492 return SDValue();
15493
15494 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
15495 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
15496 DAG.getTargetConstant(0, SL, MVT::i1));
15497 }
15498 }
15499 return SDValue();
15500 }
15501
performSetCCCombine(SDNode * N,DAGCombinerInfo & DCI) const15502 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
15503 DAGCombinerInfo &DCI) const {
15504 SelectionDAG &DAG = DCI.DAG;
15505 SDLoc SL(N);
15506
15507 SDValue LHS = N->getOperand(0);
15508 SDValue RHS = N->getOperand(1);
15509 EVT VT = LHS.getValueType();
15510 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
15511
15512 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15513 if (!CRHS) {
15514 CRHS = dyn_cast<ConstantSDNode>(LHS);
15515 if (CRHS) {
15516 std::swap(LHS, RHS);
15517 CC = getSetCCSwappedOperands(CC);
15518 }
15519 }
15520
15521 if (CRHS) {
15522 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
15523 isBoolSGPR(LHS.getOperand(0))) {
15524 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
15525 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
15526 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
15527 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
15528 if ((CRHS->isAllOnes() &&
15529 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
15530 (CRHS->isZero() &&
15531 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
15532 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
15533 DAG.getAllOnesConstant(SL, MVT::i1));
15534 if ((CRHS->isAllOnes() &&
15535 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
15536 (CRHS->isZero() &&
15537 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
15538 return LHS.getOperand(0);
15539 }
15540
15541 const APInt &CRHSVal = CRHS->getAPIntValue();
15542 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
15543 LHS.getOpcode() == ISD::SELECT &&
15544 isa<ConstantSDNode>(LHS.getOperand(1)) &&
15545 isa<ConstantSDNode>(LHS.getOperand(2)) &&
15546 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
15547 isBoolSGPR(LHS.getOperand(0))) {
15548 // Given CT != FT:
15549 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
15550 // setcc (select cc, CT, CF), CF, ne => cc
15551 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
15552 // setcc (select cc, CT, CF), CT, eq => cc
15553 const APInt &CT = LHS.getConstantOperandAPInt(1);
15554 const APInt &CF = LHS.getConstantOperandAPInt(2);
15555
15556 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
15557 (CT == CRHSVal && CC == ISD::SETNE))
15558 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
15559 DAG.getAllOnesConstant(SL, MVT::i1));
15560 if ((CF == CRHSVal && CC == ISD::SETNE) ||
15561 (CT == CRHSVal && CC == ISD::SETEQ))
15562 return LHS.getOperand(0);
15563 }
15564 }
15565
15566 if (VT != MVT::f32 && VT != MVT::f64 &&
15567 (!Subtarget->has16BitInsts() || VT != MVT::f16))
15568 return SDValue();
15569
15570 // Match isinf/isfinite pattern
15571 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
15572 // (fcmp one (fabs x), inf) -> (fp_class x,
15573 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
15574 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
15575 LHS.getOpcode() == ISD::FABS) {
15576 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
15577 if (!CRHS)
15578 return SDValue();
15579
15580 const APFloat &APF = CRHS->getValueAPF();
15581 if (APF.isInfinity() && !APF.isNegative()) {
15582 const unsigned IsInfMask =
15583 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
15584 const unsigned IsFiniteMask =
15585 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
15586 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
15587 SIInstrFlags::P_SUBNORMAL;
15588 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
15589 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
15590 DAG.getConstant(Mask, SL, MVT::i32));
15591 }
15592 }
15593
15594 return SDValue();
15595 }
15596
15597 SDValue
performCvtF32UByteNCombine(SDNode * N,DAGCombinerInfo & DCI) const15598 SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
15599 DAGCombinerInfo &DCI) const {
15600 SelectionDAG &DAG = DCI.DAG;
15601 SDLoc SL(N);
15602 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
15603
15604 SDValue Src = N->getOperand(0);
15605 SDValue Shift = N->getOperand(0);
15606
15607 // TODO: Extend type shouldn't matter (assuming legal types).
15608 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
15609 Shift = Shift.getOperand(0);
15610
15611 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
15612 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
15613 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
15614 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
15615 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
15616 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
15617 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
15618 SDValue Shifted = DAG.getZExtOrTrunc(
15619 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
15620
15621 unsigned ShiftOffset = 8 * Offset;
15622 if (Shift.getOpcode() == ISD::SHL)
15623 ShiftOffset -= C->getZExtValue();
15624 else
15625 ShiftOffset += C->getZExtValue();
15626
15627 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
15628 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
15629 MVT::f32, Shifted);
15630 }
15631 }
15632 }
15633
15634 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15635 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
15636 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
15637 // We simplified Src. If this node is not dead, visit it again so it is
15638 // folded properly.
15639 if (N->getOpcode() != ISD::DELETED_NODE)
15640 DCI.AddToWorklist(N);
15641 return SDValue(N, 0);
15642 }
15643
15644 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
15645 if (SDValue DemandedSrc =
15646 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
15647 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
15648
15649 return SDValue();
15650 }
15651
performClampCombine(SDNode * N,DAGCombinerInfo & DCI) const15652 SDValue SITargetLowering::performClampCombine(SDNode *N,
15653 DAGCombinerInfo &DCI) const {
15654 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
15655 if (!CSrc)
15656 return SDValue();
15657
15658 const MachineFunction &MF = DCI.DAG.getMachineFunction();
15659 const APFloat &F = CSrc->getValueAPF();
15660 APFloat Zero = APFloat::getZero(F.getSemantics());
15661 if (F < Zero ||
15662 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
15663 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
15664 }
15665
15666 APFloat One(F.getSemantics(), "1.0");
15667 if (F > One)
15668 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
15669
15670 return SDValue(CSrc, 0);
15671 }
15672
/// Target hook that dispatches a node to the SI-specific DAG combines.
///
/// \param N   The node being visited by the DAG combiner.
/// \param DCI Combiner state (DAG, worklist, legalization phase).
/// \returns the replacement value produced by the matching combine. Opcodes
/// that are not handled here, and cases that `break` out of the switch, are
/// forwarded to AMDGPUTargetLowering::PerformDAGCombine.
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  // This first switch runs before the optimization-level check below, so
  // promoteUniformOpToI32 gets a chance at these opcodes even at -O0.
  switch (N->getOpcode()) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::MUL:
  case ISD::SETCC:
  case ISD::SELECT:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
    if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
      return Res;
    break;
  default:
    break;
  }

  // None of the combines below are attempted when optimization is disabled.
  if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
    return SDValue();

  switch (N->getOpcode()) {
  case ISD::ADD:
    return performAddCombine(N, DCI);
  case ISD::PTRADD:
    return performPtrAddCombine(N, DCI);
  case ISD::SUB:
    return performSubCombine(N, DCI);
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    return performAddCarrySubCarryCombine(N, DCI);
  case ISD::FADD:
    return performFAddCombine(N, DCI);
  case ISD::FSUB:
    return performFSubCombine(N, DCI);
  case ISD::FDIV:
    return performFDivCombine(N, DCI);
  case ISD::FMUL:
    return performFMulCombine(N, DCI);
  case ISD::SETCC:
    return performSetCCCombine(N, DCI);
  // All integer and floating-point min/max flavours share one combine.
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUMNUM:
  case ISD::FMINIMUMNUM:
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
    return performMinMaxCombine(N, DCI);
  case ISD::FMA:
    return performFMACombine(N, DCI);
  case ISD::AND:
    return performAndCombine(N, DCI);
  case ISD::OR:
    return performOrCombine(N, DCI);
  case ISD::FSHR: {
    // Only a divergent i32 funnel shift is handed to matchPERM, and only when
    // V_PERM_B32 maps to a real MC opcode on this subtarget.
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
        TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
      return matchPERM(N, DCI);
    }
    break;
  }
  case ISD::XOR:
    return performXorCombine(N, DCI);
  case ISD::ZERO_EXTEND:
    return performZeroExtendCombine(N, DCI);
  case ISD::SIGN_EXTEND_INREG:
    return performSignExtendInRegCombine(N, DCI);
  case AMDGPUISD::FP_CLASS:
    return performClassCombine(N, DCI);
  case ISD::FCANONICALIZE:
    return performFCanonicalizeCombine(N, DCI);
  case AMDGPUISD::RCP:
    return performRcpCombine(N, DCI);
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::RSQ_CLAMP: {
    // An undef first operand makes the whole node fold to that undef.
    // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
    SDValue Src = N->getOperand(0);
    if (Src.isUndef())
      return Src;
    break;
  }
  // Both signed and unsigned int-to-fp conversions go through the same
  // combine.
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return performUCharToFloatCombine(N, DCI);
  case ISD::FCOPYSIGN:
    return performFCopySignCombine(N, DCI);
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return performCvtF32UByteNCombine(N, DCI);
  case AMDGPUISD::FMED3:
    return performFMed3Combine(N, DCI);
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
    return performCvtPkRTZCombine(N, DCI);
  case AMDGPUISD::CLAMP:
    return performClampCombine(N, DCI);
  case ISD::SCALAR_TO_VECTOR: {
    SelectionDAG &DAG = DCI.DAG;
    EVT VT = N->getValueType(0);

    // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
    if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
      SDLoc SL(N);
      SDValue Src = N->getOperand(0);
      EVT EltVT = Src.getValueType();
      // Non-i16 scalars are first reinterpreted as i16 so that they can be
      // any-extended as an integer.
      if (EltVT != MVT::i16)
        Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);

      SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
      return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
    }

    break;
  }
  case ISD::EXTRACT_VECTOR_ELT:
    return performExtractVectorEltCombine(N, DCI);
  case ISD::INSERT_VECTOR_ELT:
    return performInsertVectorEltCombine(N, DCI);
  case ISD::FP_ROUND:
    return performFPRoundCombine(N, DCI);
  case ISD::LOAD: {
    // Try load widening first; if it does not apply, a load is treated like
    // any other memory node by the default case below.
    if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
      return Widened;
    [[fallthrough]];
  }
  default: {
    // Memory nodes get the generic memory combine, but not before
    // legalization.
    if (!DCI.isBeforeLegalize()) {
      if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
        return performMemSDNodeCombine(MemNode, DCI);
    }

    break;
  }
  }

  // Nothing matched (or a case chose to `break`): defer to the shared AMDGPU
  // implementation.
  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
15831
15832 /// Helper function for adjustWritemask
SubIdx2Lane(unsigned Idx)15833 static unsigned SubIdx2Lane(unsigned Idx) {
15834 switch (Idx) {
15835 default:
15836 return ~0u;
15837 case AMDGPU::sub0:
15838 return 0;
15839 case AMDGPU::sub1:
15840 return 1;
15841 case AMDGPU::sub2:
15842 return 2;
15843 case AMDGPU::sub3:
15844 return 3;
15845 case AMDGPU::sub4:
15846 return 4; // Possible with TFE/LWE
15847 }
15848 }
15849
/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
///
/// Inspects the EXTRACT_SUBREG users of the node's first result to find out
/// which components are actually read, and if fewer components than the
/// current dmask are needed, replaces the node with an equivalent opcode that
/// has a narrower dmask and result type.
///
/// \param Node The selected image instruction.
/// \param DAG  The DAG that owns \p Node.
/// \returns \p Node when nothing was changed, or nullptr when the node was
/// replaced and its users have already been rewritten.
SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                          SelectionDAG &DAG) const {
  unsigned Opcode = Node->getMachineOpcode();

  // Subtract 1 because the vdata output is not a MachineSDNode operand.
  int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
  if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
    return Node; // not implemented for D16

  // Users[L] is the EXTRACT_SUBREG user reading lane L (at most sub0..sub4).
  SDNode *Users[5] = {nullptr};
  // Declared outside the use loop on purpose: the NewChannels == 1 path below
  // reads the value left by the last iteration.
  unsigned Lane = 0;
  // Named-operand indices are shifted by one for the same reason as D16Idx.
  unsigned DmaskIdx =
      AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
  unsigned NewDmask = 0;
  // A missing tfe/lwe operand produces a negative value once cast back to
  // int, which is what the int(...) >= 0 tests below check for.
  unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
  unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
  bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
                 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
  unsigned TFCLane = 0;
  // A second result value is treated as the chain.
  bool HasChain = Node->getNumValues() > 1;

  if (OldDmask == 0) {
    // These are folded out, but on the chance it happens don't assert.
    return Node;
  }

  unsigned OldBitsSet = llvm::popcount(OldDmask);
  // Work out which is the TFE/LWE lane if that is enabled.
  // It is the lane directly after the data lanes selected by the dmask.
  if (UsesTFC) {
    TFCLane = OldBitsSet;
  }

  // Try to figure out the used register components
  for (SDUse &Use : Node->uses()) {

    // Don't look at users of the chain.
    if (Use.getResNo() != 0)
      continue;

    SDNode *User = Use.getUser();

    // Abort if we can't understand the usage
    if (!User->isMachineOpcode() ||
        User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return Node;

    // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
    Lane = SubIdx2Lane(User->getConstantOperandVal(1));
    if (Lane == ~0u)
      return Node;

    // Check if the use is for the TFE/LWE generated result at VGPRn+1.
    if (UsesTFC && Lane == TFCLane) {
      Users[Lane] = User;
    } else {
      // Set which texture component corresponds to the lane.
      // Walk the set bits of OldDmask from the lowest up; after the loop Comp
      // is the bit position of the (Lane+1)-th set bit, or of the last set
      // bit if the mask runs out first. The loop body runs at least once
      // because OldDmask is known to be non-zero here, so Comp is always
      // assigned before it is read.
      unsigned Comp;
      for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
        Comp = llvm::countr_zero(Dmask);
        Dmask &= ~(1 << Comp);
      }

      // Abort if we have more than one user per component.
      if (Users[Lane])
        return Node;

      Users[Lane] = User;
      NewDmask |= 1 << Comp;
    }
  }

  // Don't allow 0 dmask, as hardware assumes one channel enabled.
  bool NoChannels = !NewDmask;
  if (NoChannels) {
    if (!UsesTFC) {
      // No uses of the result and not using TFC. Then do nothing.
      return Node;
    }
    // If the original dmask has one channel - then nothing to do
    if (OldBitsSet == 1)
      return Node;
    // Use an arbitrary dmask - required for the instruction to work
    NewDmask = 1;
  }
  // Abort if there's no change
  if (NewDmask == OldDmask)
    return Node;

  unsigned BitsSet = llvm::popcount(NewDmask);

  // Check for TFE or LWE - increase the number of channels by one to account
  // for the extra return value
  // This will need adjustment for D16 if this is also included in
  // adjustWriteMask (this function) but at present D16 are excluded.
  unsigned NewChannels = BitsSet + UsesTFC;

  int NewOpcode =
      AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
  assert(NewOpcode != -1 &&
         NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
         "failed to find equivalent MIMG op");

  // Adjust the writemask in the node
  // The operand list is copied verbatim except for the dmask operand.
  SmallVector<SDValue, 12> Ops;
  llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
  llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));

  MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();

  // A single channel gets a scalar result. Channel counts of 3 and 5 use
  // vector types with 4 and 8 elements respectively; other counts map
  // directly to a vector of that many elements.
  MVT ResultVT = NewChannels == 1
                     ? SVT
                     : MVT::getVectorVT(SVT, NewChannels == 3   ? 4
                                             : NewChannels == 5 ? 8
                                                                : NewChannels);
  SDVTList NewVTList =
      HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);

  MachineSDNode *NewNode =
      DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);

  if (HasChain) {
    // Update chain.
    // The memory operands are carried over to the new node as well.
    DAG.setNodeMemRefs(NewNode, Node->memoperands());
    DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
  }

  if (NewChannels == 1) {
    // With a scalar result there is nothing to extract from: the single
    // EXTRACT_SUBREG user (asserted below) is replaced by a plain COPY of the
    // new result. Lane still holds the lane of that user from the loop above.
    assert(Node->hasNUsesOfValue(1, 0));
    SDNode *Copy =
        DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
                           Users[Lane]->getValueType(0), SDValue(NewNode, 0));
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return nullptr;
  }

  // Update the users of the node with the new indices
  // Idx is the next sub-register index to hand out in the repacked result;
  // it only advances for lanes that have a user (or for the NoChannels
  // placeholder in lane 0), because unused lanes `continue` before the switch
  // at the bottom of the loop.
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
    SDNode *User = Users[i];
    if (!User) {
      // Handle the special case of NoChannels. We set NewDmask to 1 above, but
      // Users[0] is still nullptr because channel 0 doesn't really have a use.
      if (i || !NoChannels)
        continue;
    } else {
      SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
      SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
      // UpdateNodeOperands may hand back a different node than User; in that
      // case redirect User's uses to it and delete User.
      if (NewUser != User) {
        DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
        DAG.RemoveDeadNode(User);
      }
    }

    // Step Idx to the following sub-register index.
    switch (Idx) {
    default:
      break;
    case AMDGPU::sub0:
      Idx = AMDGPU::sub1;
      break;
    case AMDGPU::sub1:
      Idx = AMDGPU::sub2;
      break;
    case AMDGPU::sub2:
      Idx = AMDGPU::sub3;
      break;
    case AMDGPU::sub3:
      Idx = AMDGPU::sub4;
      break;
    }
  }

  DAG.RemoveDeadNode(Node);
  return nullptr;
}
16029
isFrameIndexOp(SDValue Op)16030 static bool isFrameIndexOp(SDValue Op) {
16031 if (Op.getOpcode() == ISD::AssertZext)
16032 Op = Op.getOperand(0);
16033
16034 return isa<FrameIndexSDNode>(Op);
16035 }
16036
16037 /// Legalize target independent instructions (e.g. INSERT_SUBREG)
16038 /// with frame index operands.
16039 /// LLVM assumes that inputs are to these instructions are registers.
16040 SDNode *
legalizeTargetIndependentNode(SDNode * Node,SelectionDAG & DAG) const16041 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
16042 SelectionDAG &DAG) const {
16043 if (Node->getOpcode() == ISD::CopyToReg) {
16044 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
16045 SDValue SrcVal = Node->getOperand(2);
16046
16047 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
16048 // to try understanding copies to physical registers.
16049 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
16050 SDLoc SL(Node);
16051 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
16052 SDValue VReg = DAG.getRegister(
16053 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
16054
16055 SDNode *Glued = Node->getGluedNode();
16056 SDValue ToVReg = DAG.getCopyToReg(
16057 Node->getOperand(0), SL, VReg, SrcVal,
16058 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
16059 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
16060 VReg, ToVReg.getValue(1));
16061 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
16062 DAG.RemoveDeadNode(Node);
16063 return ToResultReg.getNode();
16064 }
16065 }
16066
16067 SmallVector<SDValue, 8> Ops;
16068 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
16069 if (!isFrameIndexOp(Node->getOperand(i))) {
16070 Ops.push_back(Node->getOperand(i));
16071 continue;
16072 }
16073
16074 SDLoc DL(Node);
16075 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
16076 Node->getOperand(i).getValueType(),
16077 Node->getOperand(i)),
16078 0));
16079 }
16080
16081 return DAG.UpdateNodeOperands(Node, Ops);
16082 }
16083
16084 /// Fold the instructions after selecting them.
16085 /// Returns null if users were already updated.
PostISelFolding(MachineSDNode * Node,SelectionDAG & DAG) const16086 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
16087 SelectionDAG &DAG) const {
16088 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16089 unsigned Opcode = Node->getMachineOpcode();
16090
16091 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
16092 !TII->isGather4(Opcode) &&
16093 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
16094 return adjustWritemask(Node, DAG);
16095 }
16096
16097 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
16098 legalizeTargetIndependentNode(Node, DAG);
16099 return Node;
16100 }
16101
16102 switch (Opcode) {
16103 case AMDGPU::V_DIV_SCALE_F32_e64:
16104 case AMDGPU::V_DIV_SCALE_F64_e64: {
16105 // Satisfy the operand register constraint when one of the inputs is
16106 // undefined. Ordinarily each undef value will have its own implicit_def of
16107 // a vreg, so force these to use a single register.
16108 SDValue Src0 = Node->getOperand(1);
16109 SDValue Src1 = Node->getOperand(3);
16110 SDValue Src2 = Node->getOperand(5);
16111
16112 if ((Src0.isMachineOpcode() &&
16113 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
16114 (Src0 == Src1 || Src0 == Src2))
16115 break;
16116
16117 MVT VT = Src0.getValueType().getSimpleVT();
16118 const TargetRegisterClass *RC =
16119 getRegClassFor(VT, Src0.getNode()->isDivergent());
16120
16121 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
16122 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
16123
16124 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
16125 Src0, SDValue());
16126
16127 // src0 must be the same register as src1 or src2, even if the value is
16128 // undefined, so make sure we don't violate this constraint.
16129 if (Src0.isMachineOpcode() &&
16130 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
16131 if (Src1.isMachineOpcode() &&
16132 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
16133 Src0 = Src1;
16134 else if (Src2.isMachineOpcode() &&
16135 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
16136 Src0 = Src2;
16137 else {
16138 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
16139 Src0 = UndefReg;
16140 Src1 = UndefReg;
16141 }
16142 } else
16143 break;
16144
16145 SmallVector<SDValue, 9> Ops(Node->ops());
16146 Ops[1] = Src0;
16147 Ops[3] = Src1;
16148 Ops[5] = Src2;
16149 Ops.push_back(ImpDef.getValue(1));
16150 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
16151 }
16152 default:
16153 break;
16154 }
16155
16156 return Node;
16157 }
16158
16159 // Any MIMG instructions that use tfe or lwe require an initialization of the
16160 // result register that will be written in the case of a memory access failure.
16161 // The required code is also added to tie this init code to the result of the
16162 // img instruction.
AddMemOpInit(MachineInstr & MI) const16163 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
16164 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16165 const SIRegisterInfo &TRI = TII->getRegisterInfo();
16166 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
16167 MachineBasicBlock &MBB = *MI.getParent();
16168
16169 int DstIdx =
16170 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
16171 unsigned InitIdx = 0;
16172
16173 if (TII->isImage(MI)) {
16174 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
16175 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
16176 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
16177
16178 if (!TFE && !LWE) // intersect_ray
16179 return;
16180
16181 unsigned TFEVal = TFE ? TFE->getImm() : 0;
16182 unsigned LWEVal = LWE ? LWE->getImm() : 0;
16183 unsigned D16Val = D16 ? D16->getImm() : 0;
16184
16185 if (!TFEVal && !LWEVal)
16186 return;
16187
16188 // At least one of TFE or LWE are non-zero
16189 // We have to insert a suitable initialization of the result value and
16190 // tie this to the dest of the image instruction.
16191
16192 // Calculate which dword we have to initialize to 0.
16193 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
16194
16195 // check that dmask operand is found.
16196 assert(MO_Dmask && "Expected dmask operand in instruction");
16197
16198 unsigned dmask = MO_Dmask->getImm();
16199 // Determine the number of active lanes taking into account the
16200 // Gather4 special case
16201 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
16202
16203 bool Packed = !Subtarget->hasUnpackedD16VMem();
16204
16205 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
16206
16207 // Abandon attempt if the dst size isn't large enough
16208 // - this is in fact an error but this is picked up elsewhere and
16209 // reported correctly.
16210 uint32_t DstSize =
16211 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
16212 if (DstSize < InitIdx)
16213 return;
16214 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
16215 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
16216 } else {
16217 return;
16218 }
16219
16220 const DebugLoc &DL = MI.getDebugLoc();
16221
16222 // Create a register for the initialization value.
16223 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
16224 unsigned NewDst = 0; // Final initialized value will be in here
16225
16226 // If PRTStrictNull feature is enabled (the default) then initialize
16227 // all the result registers to 0, otherwise just the error indication
16228 // register (VGPRn+1)
16229 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
16230 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
16231
16232 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
16233 for (; SizeLeft; SizeLeft--, CurrIdx++) {
16234 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
16235 // Initialize dword
16236 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
16237 // clang-format off
16238 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
16239 .addImm(0);
16240 // clang-format on
16241 // Insert into the super-reg
16242 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
16243 .addReg(PrevDst)
16244 .addReg(SubReg)
16245 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
16246
16247 PrevDst = NewDst;
16248 }
16249
16250 // Add as an implicit operand
16251 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
16252
16253 // Tie the just added implicit operand to the dst
16254 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
16255 }
16256
/// Post-selection fixups applied to a freshly selected machine instruction.
///
/// For VOP3 instructions this:
///  - legalizes the operands via SIInstrInfo::legalizeOperandsVOP3;
///  - retargets src0/src1/src2 virtual registers whose class contains AGPRs
///    and whose unique definition is a COPY from an SGPR to the equivalent
///    VGPR class (src2 is left alone when the function may need AGPRs);
///  - for MAI instructions with scale operands, moves scale_src1 into a
///    register when both scale operands would use the constant bus;
///  - when the function may need AGPRs, resolves an AV-class src2 (and the
///    destination if src2 is tied) to the equivalent AGPR class.
/// VOP3 instructions return before the image check below. For any other
/// instruction, image instructions get the register class alignment of their
/// vaddr operand enforced.
///
/// \param MI   The selected instruction to adjust in place.
/// \param Node The DAG node \p MI was selected from (not read here).
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                     SDNode *Node) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  MachineFunction *MF = MI.getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  if (TII->isVOP3(MI.getOpcode())) {
    // Make sure constant bus requirements are respected.
    TII->legalizeOperandsVOP3(MRI, MI);

    // Prefer VGPRs over AGPRs in MAI instructions where possible.
    // This saves a chain-copy of registers and better balances register
    // use between VGPRs and AGPRs, as AGPR tuples tend to be big.
    if (!MI.getDesc().operands().empty()) {
      unsigned Opc = MI.getOpcode();
      bool HasAGPRs = Info->mayNeedAGPRs();
      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
      int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
      for (auto I :
           {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
        // The first missing source operand ends the scan entirely (break, not
        // continue), so later sources are not visited either.
        if (I == -1)
          break;
        // src2 is resolved to an AGPR class further down when AGPRs may be
        // needed, so do not turn it into a VGPR here.
        if ((I == Src2Idx) && (HasAGPRs))
          break;
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg() || !Op.getReg().isVirtual())
          continue;
        auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
        if (!TRI->hasAGPRs(RC))
          continue;
        // Only rewrite registers whose single definition is a COPY from an
        // SGPR source.
        auto *Src = MRI.getUniqueVRegDef(Op.getReg());
        if (!Src || !Src->isCopy() ||
            !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
          continue;
        auto *NewRC = TRI->getEquivalentVGPRClass(RC);
        // All uses of agpr64 and agpr32 can also accept vgpr except for
        // v_accvgpr_read, but we do not produce agpr reads during selection,
        // so no use checks are needed.
        MRI.setRegClass(Op.getReg(), NewRC);
      }

      if (TII->isMAI(MI)) {
        // The ordinary src0, src1, src2 were legalized above.
        //
        // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
        // as a separate instruction.
        int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                 AMDGPU::OpName::scale_src0);
        if (Src0Idx != -1) {
          int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::scale_src1);
          // If both scale operands would occupy the constant bus, move the
          // second one into a register.
          if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
              TII->usesConstantBus(MRI, MI, Src1Idx))
            TII->legalizeOpWithMove(MI, Src1Idx);
        }
      }

      if (!HasAGPRs)
        return;

      // Resolve the rest of AV operands to AGPRs.
      if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
        if (Src2->isReg() && Src2->getReg().isVirtual()) {
          auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
          if (TRI->isVectorSuperClass(RC)) {
            auto *NewRC = TRI->getEquivalentAGPRClass(RC);
            MRI.setRegClass(Src2->getReg(), NewRC);
            // A tied src2 shares its register with operand 0, so that
            // register must get the same class.
            if (Src2->isTied())
              MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
          }
        }
      }
    }

    return;
  }

  if (TII->isImage(MI))
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
}
16342
buildSMovImm32(SelectionDAG & DAG,const SDLoc & DL,uint64_t Val)16343 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
16344 uint64_t Val) {
16345 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
16346 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
16347 }
16348
wrapAddr64Rsrc(SelectionDAG & DAG,const SDLoc & DL,SDValue Ptr) const16349 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
16350 const SDLoc &DL,
16351 SDValue Ptr) const {
16352 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16353
16354 // Build the half of the subregister with the constants before building the
16355 // full 128-bit register. If we are building multiple resource descriptors,
16356 // this will allow CSEing of the 2-component register.
16357 const SDValue Ops0[] = {
16358 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
16359 buildSMovImm32(DAG, DL, 0),
16360 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
16361 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
16362 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
16363
16364 SDValue SubRegHi = SDValue(
16365 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
16366
16367 // Combine the constants and the pointer.
16368 const SDValue Ops1[] = {
16369 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
16370 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
16371 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
16372
16373 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
16374 }
16375
16376 /// Return a resource descriptor with the 'Add TID' bit enabled
16377 /// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
16378 /// of the resource descriptor) to create an offset, which is added to
16379 /// the resource pointer.
buildRSRC(SelectionDAG & DAG,const SDLoc & DL,SDValue Ptr,uint32_t RsrcDword1,uint64_t RsrcDword2And3) const16380 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
16381 SDValue Ptr, uint32_t RsrcDword1,
16382 uint64_t RsrcDword2And3) const {
16383 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
16384 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
16385 if (RsrcDword1) {
16386 PtrHi =
16387 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
16388 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
16389 0);
16390 }
16391
16392 SDValue DataLo =
16393 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
16394 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
16395
16396 const SDValue Ops[] = {
16397 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
16398 PtrLo,
16399 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
16400 PtrHi,
16401 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
16402 DataLo,
16403 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
16404 DataHi,
16405 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
16406
16407 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
16408 }
16409
16410 //===----------------------------------------------------------------------===//
16411 // SI Inline Assembly Support
16412 //===----------------------------------------------------------------------===//
16413
16414 std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo * TRI_,StringRef Constraint,MVT VT) const16415 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
16416 StringRef Constraint,
16417 MVT VT) const {
16418 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
16419
16420 const TargetRegisterClass *RC = nullptr;
16421 if (Constraint.size() == 1) {
16422 const unsigned BitWidth = VT.getSizeInBits();
16423 switch (Constraint[0]) {
16424 default:
16425 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
16426 case 's':
16427 case 'r':
16428 switch (BitWidth) {
16429 case 16:
16430 RC = &AMDGPU::SReg_32RegClass;
16431 break;
16432 case 64:
16433 RC = &AMDGPU::SGPR_64RegClass;
16434 break;
16435 default:
16436 RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
16437 if (!RC)
16438 return std::pair(0U, nullptr);
16439 break;
16440 }
16441 break;
16442 case 'v':
16443 switch (BitWidth) {
16444 case 16:
16445 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
16446 : &AMDGPU::VGPR_32RegClass;
16447 break;
16448 default:
16449 RC = TRI->getVGPRClassForBitWidth(BitWidth);
16450 if (!RC)
16451 return std::pair(0U, nullptr);
16452 break;
16453 }
16454 break;
16455 case 'a':
16456 if (!Subtarget->hasMAIInsts())
16457 break;
16458 switch (BitWidth) {
16459 case 16:
16460 RC = &AMDGPU::AGPR_32RegClass;
16461 break;
16462 default:
16463 RC = TRI->getAGPRClassForBitWidth(BitWidth);
16464 if (!RC)
16465 return std::pair(0U, nullptr);
16466 break;
16467 }
16468 break;
16469 }
16470 // We actually support i128, i16 and f16 as inline parameters
16471 // even if they are not reported as legal
16472 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
16473 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
16474 return std::pair(0U, RC);
16475 }
16476
16477 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
16478 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
16479 if (RegName.consume_front("v")) {
16480 RC = &AMDGPU::VGPR_32RegClass;
16481 } else if (RegName.consume_front("s")) {
16482 RC = &AMDGPU::SGPR_32RegClass;
16483 } else if (RegName.consume_front("a")) {
16484 RC = &AMDGPU::AGPR_32RegClass;
16485 }
16486
16487 if (RC) {
16488 uint32_t Idx;
16489 if (RegName.consume_front("[")) {
16490 uint32_t End;
16491 bool Failed = RegName.consumeInteger(10, Idx);
16492 Failed |= !RegName.consume_front(":");
16493 Failed |= RegName.consumeInteger(10, End);
16494 Failed |= !RegName.consume_back("]");
16495 if (!Failed) {
16496 uint32_t Width = (End - Idx + 1) * 32;
16497 // Prohibit constraints for register ranges with a width that does not
16498 // match the required type.
16499 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
16500 return std::pair(0U, nullptr);
16501 MCRegister Reg = RC->getRegister(Idx);
16502 if (SIRegisterInfo::isVGPRClass(RC))
16503 RC = TRI->getVGPRClassForBitWidth(Width);
16504 else if (SIRegisterInfo::isSGPRClass(RC))
16505 RC = TRI->getSGPRClassForBitWidth(Width);
16506 else if (SIRegisterInfo::isAGPRClass(RC))
16507 RC = TRI->getAGPRClassForBitWidth(Width);
16508 if (RC) {
16509 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
16510 if (!Reg) {
16511 // The register class does not contain the requested register,
16512 // e.g., because it is an SGPR pair that would violate alignment
16513 // requirements.
16514 return std::pair(0U, nullptr);
16515 }
16516 return std::pair(Reg, RC);
16517 }
16518 }
16519 } else {
16520 // Check for lossy scalar/vector conversions.
16521 if (VT.isVector() && VT.getSizeInBits() != 32)
16522 return std::pair(0U, nullptr);
16523 bool Failed = RegName.getAsInteger(10, Idx);
16524 if (!Failed && Idx < RC->getNumRegs())
16525 return std::pair(RC->getRegister(Idx), RC);
16526 }
16527 }
16528 }
16529
16530 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
16531 if (Ret.first)
16532 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
16533
16534 return Ret;
16535 }
16536
isImmConstraint(StringRef Constraint)16537 static bool isImmConstraint(StringRef Constraint) {
16538 if (Constraint.size() == 1) {
16539 switch (Constraint[0]) {
16540 default:
16541 break;
16542 case 'I':
16543 case 'J':
16544 case 'A':
16545 case 'B':
16546 case 'C':
16547 return true;
16548 }
16549 } else if (Constraint == "DA" || Constraint == "DB") {
16550 return true;
16551 }
16552 return false;
16553 }
16554
16555 SITargetLowering::ConstraintType
getConstraintType(StringRef Constraint) const16556 SITargetLowering::getConstraintType(StringRef Constraint) const {
16557 if (Constraint.size() == 1) {
16558 switch (Constraint[0]) {
16559 default:
16560 break;
16561 case 's':
16562 case 'v':
16563 case 'a':
16564 return C_RegisterClass;
16565 }
16566 }
16567 if (isImmConstraint(Constraint)) {
16568 return C_Other;
16569 }
16570 return TargetLowering::getConstraintType(Constraint);
16571 }
16572
clearUnusedBits(uint64_t Val,unsigned Size)16573 static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
16574 if (!AMDGPU::isInlinableIntLiteral(Val)) {
16575 Val = Val & maskTrailingOnes<uint64_t>(Size);
16576 }
16577 return Val;
16578 }
16579
LowerAsmOperandForConstraint(SDValue Op,StringRef Constraint,std::vector<SDValue> & Ops,SelectionDAG & DAG) const16580 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
16581 StringRef Constraint,
16582 std::vector<SDValue> &Ops,
16583 SelectionDAG &DAG) const {
16584 if (isImmConstraint(Constraint)) {
16585 uint64_t Val;
16586 if (getAsmOperandConstVal(Op, Val) &&
16587 checkAsmConstraintVal(Op, Constraint, Val)) {
16588 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
16589 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
16590 }
16591 } else {
16592 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
16593 }
16594 }
16595
getAsmOperandConstVal(SDValue Op,uint64_t & Val) const16596 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
16597 unsigned Size = Op.getScalarValueSizeInBits();
16598 if (Size > 64)
16599 return false;
16600
16601 if (Size == 16 && !Subtarget->has16BitInsts())
16602 return false;
16603
16604 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
16605 Val = C->getSExtValue();
16606 return true;
16607 }
16608 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
16609 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
16610 return true;
16611 }
16612 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
16613 if (Size != 16 || Op.getNumOperands() != 2)
16614 return false;
16615 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
16616 return false;
16617 if (ConstantSDNode *C = V->getConstantSplatNode()) {
16618 Val = C->getSExtValue();
16619 return true;
16620 }
16621 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
16622 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
16623 return true;
16624 }
16625 }
16626
16627 return false;
16628 }
16629
checkAsmConstraintVal(SDValue Op,StringRef Constraint,uint64_t Val) const16630 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
16631 uint64_t Val) const {
16632 if (Constraint.size() == 1) {
16633 switch (Constraint[0]) {
16634 case 'I':
16635 return AMDGPU::isInlinableIntLiteral(Val);
16636 case 'J':
16637 return isInt<16>(Val);
16638 case 'A':
16639 return checkAsmConstraintValA(Op, Val);
16640 case 'B':
16641 return isInt<32>(Val);
16642 case 'C':
16643 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
16644 AMDGPU::isInlinableIntLiteral(Val);
16645 default:
16646 break;
16647 }
16648 } else if (Constraint.size() == 2) {
16649 if (Constraint == "DA") {
16650 int64_t HiBits = static_cast<int32_t>(Val >> 32);
16651 int64_t LoBits = static_cast<int32_t>(Val);
16652 return checkAsmConstraintValA(Op, HiBits, 32) &&
16653 checkAsmConstraintValA(Op, LoBits, 32);
16654 }
16655 if (Constraint == "DB") {
16656 return true;
16657 }
16658 }
16659 llvm_unreachable("Invalid asm constraint");
16660 }
16661
checkAsmConstraintValA(SDValue Op,uint64_t Val,unsigned MaxSize) const16662 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
16663 unsigned MaxSize) const {
16664 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
16665 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
16666 if (Size == 16) {
16667 MVT VT = Op.getSimpleValueType();
16668 switch (VT.SimpleTy) {
16669 default:
16670 return false;
16671 case MVT::i16:
16672 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
16673 case MVT::f16:
16674 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
16675 case MVT::bf16:
16676 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
16677 case MVT::v2i16:
16678 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
16679 case MVT::v2f16:
16680 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
16681 case MVT::v2bf16:
16682 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
16683 }
16684 }
16685 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
16686 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
16687 return true;
16688 return false;
16689 }
16690
getAlignedAGPRClassID(unsigned UnalignedClassID)16691 static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
16692 switch (UnalignedClassID) {
16693 case AMDGPU::VReg_64RegClassID:
16694 return AMDGPU::VReg_64_Align2RegClassID;
16695 case AMDGPU::VReg_96RegClassID:
16696 return AMDGPU::VReg_96_Align2RegClassID;
16697 case AMDGPU::VReg_128RegClassID:
16698 return AMDGPU::VReg_128_Align2RegClassID;
16699 case AMDGPU::VReg_160RegClassID:
16700 return AMDGPU::VReg_160_Align2RegClassID;
16701 case AMDGPU::VReg_192RegClassID:
16702 return AMDGPU::VReg_192_Align2RegClassID;
16703 case AMDGPU::VReg_224RegClassID:
16704 return AMDGPU::VReg_224_Align2RegClassID;
16705 case AMDGPU::VReg_256RegClassID:
16706 return AMDGPU::VReg_256_Align2RegClassID;
16707 case AMDGPU::VReg_288RegClassID:
16708 return AMDGPU::VReg_288_Align2RegClassID;
16709 case AMDGPU::VReg_320RegClassID:
16710 return AMDGPU::VReg_320_Align2RegClassID;
16711 case AMDGPU::VReg_352RegClassID:
16712 return AMDGPU::VReg_352_Align2RegClassID;
16713 case AMDGPU::VReg_384RegClassID:
16714 return AMDGPU::VReg_384_Align2RegClassID;
16715 case AMDGPU::VReg_512RegClassID:
16716 return AMDGPU::VReg_512_Align2RegClassID;
16717 case AMDGPU::VReg_1024RegClassID:
16718 return AMDGPU::VReg_1024_Align2RegClassID;
16719 case AMDGPU::AReg_64RegClassID:
16720 return AMDGPU::AReg_64_Align2RegClassID;
16721 case AMDGPU::AReg_96RegClassID:
16722 return AMDGPU::AReg_96_Align2RegClassID;
16723 case AMDGPU::AReg_128RegClassID:
16724 return AMDGPU::AReg_128_Align2RegClassID;
16725 case AMDGPU::AReg_160RegClassID:
16726 return AMDGPU::AReg_160_Align2RegClassID;
16727 case AMDGPU::AReg_192RegClassID:
16728 return AMDGPU::AReg_192_Align2RegClassID;
16729 case AMDGPU::AReg_256RegClassID:
16730 return AMDGPU::AReg_256_Align2RegClassID;
16731 case AMDGPU::AReg_512RegClassID:
16732 return AMDGPU::AReg_512_Align2RegClassID;
16733 case AMDGPU::AReg_1024RegClassID:
16734 return AMDGPU::AReg_1024_Align2RegClassID;
16735 default:
16736 return -1;
16737 }
16738 }
16739
16740 // Figure out which registers should be reserved for stack access. Only after
16741 // the function is legalized do we know all of the non-spill stack objects or if
16742 // calls are present.
finalizeLowering(MachineFunction & MF) const16743 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
16744 MachineRegisterInfo &MRI = MF.getRegInfo();
16745 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16746 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
16747 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16748 const SIInstrInfo *TII = ST.getInstrInfo();
16749
16750 if (Info->isEntryFunction()) {
16751 // Callable functions have fixed registers used for stack access.
16752 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
16753 }
16754
16755 // TODO: Move this logic to getReservedRegs()
16756 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
16757 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16758 Register SReg = ST.isWave32()
16759 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
16760 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
16761 &AMDGPU::SGPR_64RegClass);
16762 Info->setSGPRForEXECCopy(SReg);
16763
16764 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
16765 Info->getStackPtrOffsetReg()));
16766 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16767 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
16768
16769 // We need to worry about replacing the default register with itself in case
16770 // of MIR testcases missing the MFI.
16771 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16772 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
16773
16774 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16775 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
16776
16777 Info->limitOccupancy(MF);
16778
16779 if (ST.isWave32() && !MF.empty()) {
16780 for (auto &MBB : MF) {
16781 for (auto &MI : MBB) {
16782 TII->fixImplicitOperands(MI);
16783 }
16784 }
16785 }
16786
16787 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
16788 // classes if required. Ideally the register class constraints would differ
16789 // per-subtarget, but there's no easy way to achieve that right now. This is
16790 // not a problem for VGPRs because the correctly aligned VGPR class is implied
16791 // from using them as the register class for legal types.
16792 if (ST.needsAlignedVGPRs()) {
16793 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
16794 const Register Reg = Register::index2VirtReg(I);
16795 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
16796 if (!RC)
16797 continue;
16798 int NewClassID = getAlignedAGPRClassID(RC->getID());
16799 if (NewClassID != -1)
16800 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
16801 }
16802 }
16803
16804 TargetLoweringBase::finalizeLowering(MF);
16805 }
16806
computeKnownBitsForTargetNode(const SDValue Op,KnownBits & Known,const APInt & DemandedElts,const SelectionDAG & DAG,unsigned Depth) const16807 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
16808 KnownBits &Known,
16809 const APInt &DemandedElts,
16810 const SelectionDAG &DAG,
16811 unsigned Depth) const {
16812 Known.resetAll();
16813 unsigned Opc = Op.getOpcode();
16814 switch (Opc) {
16815 case ISD::INTRINSIC_WO_CHAIN: {
16816 unsigned IID = Op.getConstantOperandVal(0);
16817 switch (IID) {
16818 case Intrinsic::amdgcn_mbcnt_lo:
16819 case Intrinsic::amdgcn_mbcnt_hi: {
16820 const GCNSubtarget &ST =
16821 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
16822 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16823 // most 31 + src1.
16824 Known.Zero.setBitsFrom(
16825 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
16826 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
16827 Known = KnownBits::add(Known, Known2);
16828 return;
16829 }
16830 }
16831 break;
16832 }
16833 }
16834 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
16835 Op, Known, DemandedElts, DAG, Depth);
16836 }
16837
computeKnownBitsForFrameIndex(const int FI,KnownBits & Known,const MachineFunction & MF) const16838 void SITargetLowering::computeKnownBitsForFrameIndex(
16839 const int FI, KnownBits &Known, const MachineFunction &MF) const {
16840 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
16841
16842 // Set the high bits to zero based on the maximum allowed scratch size per
16843 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
16844 // calculation won't overflow, so assume the sign bit is never set.
16845 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
16846 }
16847
knownBitsForWorkitemID(const GCNSubtarget & ST,GISelValueTracking & VT,KnownBits & Known,unsigned Dim)16848 static void knownBitsForWorkitemID(const GCNSubtarget &ST,
16849 GISelValueTracking &VT, KnownBits &Known,
16850 unsigned Dim) {
16851 unsigned MaxValue =
16852 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
16853 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
16854 }
16855
computeKnownBitsForTargetInstr(GISelValueTracking & VT,Register R,KnownBits & Known,const APInt & DemandedElts,const MachineRegisterInfo & MRI,unsigned Depth) const16856 void SITargetLowering::computeKnownBitsForTargetInstr(
16857 GISelValueTracking &VT, Register R, KnownBits &Known,
16858 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
16859 unsigned Depth) const {
16860 const MachineInstr *MI = MRI.getVRegDef(R);
16861 switch (MI->getOpcode()) {
16862 case AMDGPU::G_INTRINSIC:
16863 case AMDGPU::G_INTRINSIC_CONVERGENT: {
16864 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
16865 switch (IID) {
16866 case Intrinsic::amdgcn_workitem_id_x:
16867 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
16868 break;
16869 case Intrinsic::amdgcn_workitem_id_y:
16870 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
16871 break;
16872 case Intrinsic::amdgcn_workitem_id_z:
16873 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
16874 break;
16875 case Intrinsic::amdgcn_mbcnt_lo:
16876 case Intrinsic::amdgcn_mbcnt_hi: {
16877 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16878 // most 31 + src1.
16879 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
16880 ? getSubtarget()->getWavefrontSizeLog2()
16881 : 5);
16882 KnownBits Known2;
16883 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
16884 Depth + 1);
16885 Known = KnownBits::add(Known, Known2);
16886 break;
16887 }
16888 case Intrinsic::amdgcn_groupstaticsize: {
16889 // We can report everything over the maximum size as 0. We can't report
16890 // based on the actual size because we don't know if it's accurate or not
16891 // at any given point.
16892 Known.Zero.setHighBits(
16893 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
16894 break;
16895 }
16896 }
16897 break;
16898 }
16899 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16900 Known.Zero.setHighBits(24);
16901 break;
16902 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16903 Known.Zero.setHighBits(16);
16904 break;
16905 case AMDGPU::G_AMDGPU_SMED3:
16906 case AMDGPU::G_AMDGPU_UMED3: {
16907 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
16908
16909 KnownBits Known2;
16910 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
16911 if (Known2.isUnknown())
16912 break;
16913
16914 KnownBits Known1;
16915 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
16916 if (Known1.isUnknown())
16917 break;
16918
16919 KnownBits Known0;
16920 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
16921 if (Known0.isUnknown())
16922 break;
16923
16924 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
16925 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
16926 Known.One = Known0.One & Known1.One & Known2.One;
16927 break;
16928 }
16929 }
16930 }
16931
computeKnownAlignForTargetInstr(GISelValueTracking & VT,Register R,const MachineRegisterInfo & MRI,unsigned Depth) const16932 Align SITargetLowering::computeKnownAlignForTargetInstr(
16933 GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
16934 unsigned Depth) const {
16935 const MachineInstr *MI = MRI.getVRegDef(R);
16936 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
16937 // FIXME: Can this move to generic code? What about the case where the call
16938 // site specifies a lower alignment?
16939 Intrinsic::ID IID = GI->getIntrinsicID();
16940 LLVMContext &Ctx = VT.getMachineFunction().getFunction().getContext();
16941 AttributeList Attrs =
16942 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
16943 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
16944 return *RetAlign;
16945 }
16946 return Align(1);
16947 }
16948
/// Choose the preferred alignment for the header of loop \p ML and, for
/// larger loops, insert S_INST_PREFETCH instructions around the loop.
///
/// Returns the generic preferred alignment when \p ML is null, loop alignment
/// is disabled, the subtarget lacks instruction prefetch (or has the forward
/// prefetch bug), or the loop is at most 64 bytes or more than 192 bytes.
/// If the header already carries a different alignment, that alignment is
/// returned. Otherwise returns a 64-byte alignment.
///
/// Note that this may modify the function: for loops between 129 and 192
/// bytes that have both a preheader and a single exit block, prefetch
/// instructions are added to those blocks unless already present.
Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
  const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
  const Align CacheLineAlign = Align(64);

  // Pre-GFX10 targets did not benefit from loop alignment.
  if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
      getSubtarget()->hasInstFwdPrefetchBug())
    return PrefAlign;

  // On GFX10 I$ is 4 x 64 bytes cache lines.
  // By default prefetcher keeps one cache line behind and reads two ahead.
  // We can modify it with S_INST_PREFETCH for larger loops to have two lines
  // behind and one ahead.
  // Therefore we can benefit from aligning loop headers if the loop fits in
  // 192 bytes.
  // If the loop fits in 64 bytes it always spans no more than two cache lines
  // and does not need an alignment.
  // Else if the loop is at most 128 bytes we do not need to modify prefetch.
  // Else if the loop is at most 192 bytes we need two lines behind.

  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const MachineBasicBlock *Header = ML->getHeader();
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment(); // Already processed.

  // Estimate the loop size in bytes, bailing out once it exceeds 192.
  unsigned LoopSize = 0;
  for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If an inner loop block is aligned, assume that on average half of the
    // alignment size is added as nops.
    if (MBB != Header)
      LoopSize += MBB->getAlignment().value() / 2;

    for (const MachineInstr &MI : *MBB) {
      LoopSize += TII->getInstSizeInBytes(MI);
      if (LoopSize > 192)
        return PrefAlign;
    }
  }

  if (LoopSize <= 64)
    return PrefAlign;

  if (LoopSize <= 128)
    return CacheLineAlign;

  // If any of the parent loops is surrounded by prefetch instructions, do not
  // insert new ones for the inner loop, which would reset the parent's
  // settings.
  for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
    if (MachineBasicBlock *Exit = P->getExitBlock()) {
      auto I = Exit->getFirstNonDebugInstr();
      if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
        return CacheLineAlign;
    }
  }

  MachineBasicBlock *Pre = ML->getLoopPreheader();
  MachineBasicBlock *Exit = ML->getExitBlock();

  if (Pre && Exit) {
    // Switch the prefetch mode just before the preheader's terminator, unless
    // a prefetch instruction is already the last non-terminator there.
    auto PreTerm = Pre->getFirstTerminator();
    if (PreTerm == Pre->begin() ||
        std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
          .addImm(1); // prefetch 2 lines behind PC

    // Set the mode again at the top of the exit block, unless a prefetch
    // instruction is already its first non-debug instruction.
    auto ExitHead = Exit->getFirstNonDebugInstr();
    if (ExitHead == Exit->end() ||
        ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
          .addImm(2); // prefetch 1 line behind PC
  }

  return CacheLineAlign;
}
17022
17023 LLVM_ATTRIBUTE_UNUSED
isCopyFromRegOfInlineAsm(const SDNode * N)17024 static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
17025 assert(N->getOpcode() == ISD::CopyFromReg);
17026 do {
17027 // Follow the chain until we find an INLINEASM node.
17028 N = N->getOperand(0).getNode();
17029 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
17030 return true;
17031 } while (N->getOpcode() == ISD::CopyFromReg);
17032 return false;
17033 }
17034
isSDNodeSourceOfDivergence(const SDNode * N,FunctionLoweringInfo * FLI,UniformityInfo * UA) const17035 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
17036 FunctionLoweringInfo *FLI,
17037 UniformityInfo *UA) const {
17038 switch (N->getOpcode()) {
17039 case ISD::CopyFromReg: {
17040 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
17041 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
17042 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17043 Register Reg = R->getReg();
17044
17045 // FIXME: Why does this need to consider isLiveIn?
17046 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
17047 return !TRI->isSGPRReg(MRI, Reg);
17048
17049 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
17050 return UA->isDivergent(V);
17051
17052 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
17053 return !TRI->isSGPRReg(MRI, Reg);
17054 }
17055 case ISD::LOAD: {
17056 const LoadSDNode *L = cast<LoadSDNode>(N);
17057 unsigned AS = L->getAddressSpace();
17058 // A flat load may access private memory.
17059 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
17060 }
17061 case ISD::CALLSEQ_END:
17062 return true;
17063 case ISD::INTRINSIC_WO_CHAIN:
17064 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
17065 case ISD::INTRINSIC_W_CHAIN:
17066 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
17067 case AMDGPUISD::ATOMIC_CMP_SWAP:
17068 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
17069 case AMDGPUISD::BUFFER_ATOMIC_ADD:
17070 case AMDGPUISD::BUFFER_ATOMIC_SUB:
17071 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
17072 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
17073 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
17074 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
17075 case AMDGPUISD::BUFFER_ATOMIC_AND:
17076 case AMDGPUISD::BUFFER_ATOMIC_OR:
17077 case AMDGPUISD::BUFFER_ATOMIC_XOR:
17078 case AMDGPUISD::BUFFER_ATOMIC_INC:
17079 case AMDGPUISD::BUFFER_ATOMIC_DEC:
17080 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
17081 case AMDGPUISD::BUFFER_ATOMIC_CSUB:
17082 case AMDGPUISD::BUFFER_ATOMIC_FADD:
17083 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
17084 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
17085 // Target-specific read-modify-write atomics are sources of divergence.
17086 return true;
17087 default:
17088 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
17089 // Generic read-modify-write atomics are sources of divergence.
17090 return A->readMem() && A->writeMem();
17091 }
17092 return false;
17093 }
17094 }
17095
denormalsEnabledForType(const SelectionDAG & DAG,EVT VT) const17096 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
17097 EVT VT) const {
17098 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
17099 case MVT::f32:
17100 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
17101 case MVT::f64:
17102 case MVT::f16:
17103 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
17104 default:
17105 return false;
17106 }
17107 }
17108
denormalsEnabledForType(LLT Ty,const MachineFunction & MF) const17109 bool SITargetLowering::denormalsEnabledForType(
17110 LLT Ty, const MachineFunction &MF) const {
17111 switch (Ty.getScalarSizeInBits()) {
17112 case 32:
17113 return !denormalModeIsFlushAllF32(MF);
17114 case 64:
17115 case 16:
17116 return !denormalModeIsFlushAllF64F16(MF);
17117 default:
17118 return false;
17119 }
17120 }
17121
isKnownNeverNaNForTargetNode(SDValue Op,const APInt & DemandedElts,const SelectionDAG & DAG,bool SNaN,unsigned Depth) const17122 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
17123 const APInt &DemandedElts,
17124 const SelectionDAG &DAG,
17125 bool SNaN,
17126 unsigned Depth) const {
17127 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
17128 const MachineFunction &MF = DAG.getMachineFunction();
17129 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
17130
17131 if (Info->getMode().DX10Clamp)
17132 return true; // Clamped to 0.
17133 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
17134 }
17135
17136 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
17137 DAG, SNaN, Depth);
17138 }
17139
17140 // On older subtargets, global FP atomic instructions have a hardcoded FP mode
17141 // and do not support FP32 denormals, and only support v2f16/f64 denormals.
atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst * RMW)17142 static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
17143 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
17144 return true;
17145
17146 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
17147 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
17148 if (DenormMode == DenormalMode::getPreserveSign())
17149 return true;
17150
17151 // TODO: Remove this.
17152 return RMW->getFunction()
17153 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
17154 .getValueAsBool();
17155 }
17156
emitAtomicRMWLegalRemark(const AtomicRMWInst * RMW)17157 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
17158 LLVMContext &Ctx = RMW->getContext();
17159 StringRef MemScope =
17160 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
17161
17162 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
17163 << "Hardware instruction generated for atomic "
17164 << RMW->getOperationName(RMW->getOperation())
17165 << " operation at memory scope " << MemScope;
17166 }
17167
isV2F16OrV2BF16(Type * Ty)17168 static bool isV2F16OrV2BF16(Type *Ty) {
17169 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
17170 Type *EltTy = VT->getElementType();
17171 return VT->getNumElements() == 2 &&
17172 (EltTy->isHalfTy() || EltTy->isBFloatTy());
17173 }
17174
17175 return false;
17176 }
17177
isV2F16(Type * Ty)17178 static bool isV2F16(Type *Ty) {
17179 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
17180 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
17181 }
17182
isV2BF16(Type * Ty)17183 static bool isV2BF16(Type *Ty) {
17184 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
17185 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
17186 }
17187
17188 /// \return true if atomicrmw integer ops work for the type.
isAtomicRMWLegalIntTy(Type * Ty)17189 static bool isAtomicRMWLegalIntTy(Type *Ty) {
17190 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
17191 unsigned BW = IT->getBitWidth();
17192 return BW == 32 || BW == 64;
17193 }
17194
17195 return false;
17196 }
17197
17198 /// \return true if this atomicrmw xchg type can be selected.
isAtomicRMWLegalXChgTy(const AtomicRMWInst * RMW)17199 static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
17200 Type *Ty = RMW->getType();
17201 if (isAtomicRMWLegalIntTy(Ty))
17202 return true;
17203
17204 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
17205 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
17206 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
17207 return BW == 32 || BW == 64;
17208 }
17209
17210 if (Ty->isFloatTy() || Ty->isDoubleTy())
17211 return true;
17212
17213 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) {
17214 return VT->getNumElements() == 2 &&
17215 VT->getElementType()->getPrimitiveSizeInBits() == 16;
17216 }
17217
17218 return false;
17219 }
17220
17221 /// \returns true if it's valid to emit a native instruction for \p RMW, based
17222 /// on the properties of the target memory.
globalMemoryFPAtomicIsLegal(const GCNSubtarget & Subtarget,const AtomicRMWInst * RMW,bool HasSystemScope)17223 static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
17224 const AtomicRMWInst *RMW,
17225 bool HasSystemScope) {
17226 // The remote/fine-grained access logic is different from the integer
17227 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
17228 // fine-grained access does not work, even for a device local allocation.
17229 //
17230 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
17231 // allocations work.
17232 if (HasSystemScope) {
17233 if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
17234 RMW->hasMetadata("amdgpu.no.remote.memory"))
17235 return true;
17236 } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
17237 return true;
17238
17239 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
17240 }
17241
17242 /// \return Action to perform on AtomicRMWInsts for integer operations.
17243 static TargetLowering::AtomicExpansionKind
atomicSupportedIfLegalIntType(const AtomicRMWInst * RMW)17244 atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
17245 return isAtomicRMWLegalIntTy(RMW->getType())
17246 ? TargetLowering::AtomicExpansionKind::None
17247 : TargetLowering::AtomicExpansionKind::CmpXChg;
17248 }
17249
17250 /// Return if a flat address space atomicrmw can access private memory.
flatInstrMayAccessPrivate(const Instruction * I)17251 static bool flatInstrMayAccessPrivate(const Instruction *I) {
17252 const MDNode *NoaliasAddrSpaceMD =
17253 I->getMetadata(LLVMContext::MD_noalias_addrspace);
17254 if (!NoaliasAddrSpaceMD)
17255 return true;
17256
17257 for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
17258 ++I) {
17259 auto *Low = mdconst::extract<ConstantInt>(
17260 NoaliasAddrSpaceMD->getOperand(2 * I + 0));
17261 if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) {
17262 auto *High = mdconst::extract<ConstantInt>(
17263 NoaliasAddrSpaceMD->getOperand(2 * I + 1));
17264 return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS);
17265 }
17266 }
17267
17268 return true;
17269 }
17270
/// Choose the expansion strategy for the atomicrmw \p RMW.
///
/// The decision depends on the pointer address space, the operation, the
/// value type, whether the atomic is system scoped, metadata on the
/// instruction, and subtarget feature queries. Private address space atomics
/// are always reported as NotAtomic. Operations with no dedicated handling
/// (including nand and fsub) fall back to CmpXChg.
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  unsigned AS = RMW->getPointerAddressSpace();
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return AtomicExpansionKind::NotAtomic;

  // 64-bit flat atomics that dynamically reside in private memory will silently
  // be dropped.
  //
  // Note that we will emit a new copy of the original atomic in the expansion,
  // which will be incrementally relegalized.
  const DataLayout &DL = RMW->getFunction()->getDataLayout();
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      DL.getTypeSizeInBits(RMW->getType()) == 64 &&
      flatInstrMayAccessPrivate(RMW))
    return AtomicExpansionKind::Expand;

  // Emits the "hardware instruction generated" remark for RMW and then hands
  // back the requested kind unchanged, so it can wrap a return expression.
  auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
    OptimizationRemarkEmitter ORE(RMW->getFunction());
    ORE.emit([=]() {
      return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
    });
    return Kind;
  };

  // Both the default system scope and the "one-as" scope count as system
  // scope for the checks below.
  auto SSID = RMW->getSyncScopeID();
  bool HasSystemScope =
      SSID == SyncScope::System ||
      SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");

  auto Op = RMW->getOperation();
  switch (Op) {
  case AtomicRMWInst::Xchg: {
    // PCIe supports add and xchg for system atomics.
    return isAtomicRMWLegalXChgTy(RMW)
               ? TargetLowering::AtomicExpansionKind::None
               : TargetLowering::AtomicExpansionKind::CmpXChg;
  }
  case AtomicRMWInst::Add:
  case AtomicRMWInst::And:
  case AtomicRMWInst::UIncWrap:
  case AtomicRMWInst::UDecWrap:
    return atomicSupportedIfLegalIntType(RMW);
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor: {
    // Atomic sub/or/xor do not work over PCI express, but atomic add
    // does. InstCombine transforms these with 0 to or, so undo that.
    if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
      if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
          ConstVal && ConstVal->isNullValue())
        return AtomicExpansionKind::Expand;
    }

    return atomicSupportedIfLegalIntType(RMW);
  }
  case AtomicRMWInst::FAdd: {
    Type *Ty = RMW->getType();

    // TODO: Handle REGION_ADDRESS
    if (AS == AMDGPUAS::LOCAL_ADDRESS) {
      // DS F32 FP atomics do respect the denormal mode, but the rounding mode
      // is fixed to round-to-nearest-even.
      //
      // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
      // round-to-nearest-even.
      //
      // We ignore the rounding mode problem, even in strictfp. The C++ standard
      // suggests it is OK if the floating-point mode may not match the calling
      // thread.
      if (Ty->isFloatTy()) {
        return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
                                                 : AtomicExpansionKind::CmpXChg;
      }

      if (Ty->isDoubleTy()) {
        // Ignores denormal mode, but we don't consider flushing mandatory.
        return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
                                                 : AtomicExpansionKind::CmpXChg;
      }

      if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
        return AtomicExpansionKind::None;

      return AtomicExpansionKind::CmpXChg;
    }

    // LDS atomics respect the denormal mode from the mode register.
    //
    // Traditionally f32 global/buffer memory atomics would unconditionally
    // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
    // flush.
    //
    // On targets with flat atomic fadd, denormals would flush depending on
    // whether the target address resides in LDS or global memory. We consider
    // this flat-maybe-flush as will-flush.
    if (Ty->isFloatTy() &&
        !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
        !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
      return AtomicExpansionKind::CmpXChg;

    // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
    // safe. The message phrasing also should be better.
    if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
      if (AS == AMDGPUAS::FLAT_ADDRESS) {
        // gfx942, gfx12
        if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
      } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
        // gfx90a, gfx942, gfx12
        if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
          return ReportUnsafeHWInst(AtomicExpansionKind::None);

        // gfx942, gfx12
        if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
      } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
        // gfx90a, gfx942, gfx12
        if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
          return ReportUnsafeHWInst(AtomicExpansionKind::None);

        // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
        // buffer. gfx12 does have the buffer version.
        if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
      }

      // global and flat atomic fadd f64: gfx90a, gfx942.
      if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
        return ReportUnsafeHWInst(AtomicExpansionKind::None);

      if (AS != AMDGPUAS::FLAT_ADDRESS) {
        if (Ty->isFloatTy()) {
          // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
          // gfx11+.
          if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
            return ReportUnsafeHWInst(AtomicExpansionKind::None);
          // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
          if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
            return ReportUnsafeHWInst(AtomicExpansionKind::None);
        } else {
          // gfx908
          if (RMW->use_empty() &&
              Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
              isV2F16(Ty))
            return ReportUnsafeHWInst(AtomicExpansionKind::None);
        }
      }

      // flat atomic fadd f32: gfx942, gfx11+.
      if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
        if (Subtarget->hasFlatAtomicFaddF32Inst())
          return ReportUnsafeHWInst(AtomicExpansionKind::None);

        // If it is in flat address space, and the type is float, we will try to
        // expand it, if the target supports global and lds atomic fadd. The
        // reason we need that is, in the expansion, we emit the check of
        // address space. If it is in global address space, we emit the global
        // atomic fadd; if it is in shared address space, we emit the LDS atomic
        // fadd.
        if (Subtarget->hasLDSFPAtomicAddF32()) {
          if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
            return AtomicExpansionKind::Expand;
          if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
            return AtomicExpansionKind::Expand;
        }
      }
    }

    // No native instruction applies; fall back to a cmpxchg expansion.
    return AtomicExpansionKind::CmpXChg;
  }
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax: {
    Type *Ty = RMW->getType();

    // LDS float and double fmin/fmax were always supported.
    if (AS == AMDGPUAS::LOCAL_ADDRESS) {
      return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
                                                 : AtomicExpansionKind::CmpXChg;
    }

    if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
      // For flat and global cases:
      // float, double in gfx7. Manual claims denormal support.
      // Removed in gfx8.
      // float, double restored in gfx10.
      // double removed again in gfx11, so only f32 for gfx11/gfx12.
      //
      // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
      // no f32.
      if (AS == AMDGPUAS::FLAT_ADDRESS) {
        if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
        if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
      } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
                 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
        if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
        if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
      }
    }

    return AtomicExpansionKind::CmpXChg;
  }
  case AtomicRMWInst::Min:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::UMax: {
    if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
        AS == AMDGPUAS::BUFFER_FAT_POINTER) {
      // Always expand system scope min/max atomics.
      if (HasSystemScope)
        return AtomicExpansionKind::CmpXChg;
    }

    return atomicSupportedIfLegalIntType(RMW);
  }
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::FSub:
  default:
    return AtomicExpansionKind::CmpXChg;
  }

  // Every switch path above returns, so control never reaches this point.
  llvm_unreachable("covered atomicrmw op switch");
}
17498
17499 TargetLowering::AtomicExpansionKind
shouldExpandAtomicLoadInIR(LoadInst * LI) const17500 SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
17501 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
17502 ? AtomicExpansionKind::NotAtomic
17503 : AtomicExpansionKind::None;
17504 }
17505
17506 TargetLowering::AtomicExpansionKind
shouldExpandAtomicStoreInIR(StoreInst * SI) const17507 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
17508 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
17509 ? AtomicExpansionKind::NotAtomic
17510 : AtomicExpansionKind::None;
17511 }
17512
17513 TargetLowering::AtomicExpansionKind
shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst * CmpX) const17514 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
17515 unsigned AddrSpace = CmpX->getPointerAddressSpace();
17516 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
17517 return AtomicExpansionKind::NotAtomic;
17518
17519 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
17520 return AtomicExpansionKind::None;
17521
17522 const DataLayout &DL = CmpX->getDataLayout();
17523
17524 Type *ValTy = CmpX->getNewValOperand()->getType();
17525
17526 // If a 64-bit flat atomic may alias private, we need to avoid using the
17527 // atomic in the private case.
17528 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
17529 : AtomicExpansionKind::None;
17530 }
17531
17532 const TargetRegisterClass *
getRegClassFor(MVT VT,bool isDivergent) const17533 SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
17534 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
17535 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17536 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
17537 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
17538 : &AMDGPU::SReg_32RegClass;
17539 if (!TRI->isSGPRClass(RC) && !isDivergent)
17540 return TRI->getEquivalentSGPRClass(RC);
17541 if (TRI->isSGPRClass(RC) && isDivergent)
17542 return TRI->getEquivalentVGPRClass(RC);
17543
17544 return RC;
17545 }
17546
17547 // FIXME: This is a workaround for DivergenceAnalysis not understanding always
17548 // uniform values (as produced by the mask results of control flow intrinsics)
17549 // used outside of divergent blocks. The phi users need to also be treated as
17550 // always uniform.
17551 //
17552 // FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
hasCFUser(const Value * V,SmallPtrSet<const Value *,16> & Visited,unsigned WaveSize)17553 static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
17554 unsigned WaveSize) {
17555 // FIXME: We assume we never cast the mask results of a control flow
17556 // intrinsic.
17557 // Early exit if the type won't be consistent as a compile time hack.
17558 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
17559 if (!IT || IT->getBitWidth() != WaveSize)
17560 return false;
17561
17562 if (!isa<Instruction>(V))
17563 return false;
17564 if (!Visited.insert(V).second)
17565 return false;
17566 bool Result = false;
17567 for (const auto *U : V->users()) {
17568 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
17569 if (V == U->getOperand(1)) {
17570 switch (Intrinsic->getIntrinsicID()) {
17571 default:
17572 Result = false;
17573 break;
17574 case Intrinsic::amdgcn_if_break:
17575 case Intrinsic::amdgcn_if:
17576 case Intrinsic::amdgcn_else:
17577 Result = true;
17578 break;
17579 }
17580 }
17581 if (V == U->getOperand(0)) {
17582 switch (Intrinsic->getIntrinsicID()) {
17583 default:
17584 Result = false;
17585 break;
17586 case Intrinsic::amdgcn_end_cf:
17587 case Intrinsic::amdgcn_loop:
17588 Result = true;
17589 break;
17590 }
17591 }
17592 } else {
17593 Result = hasCFUser(U, Visited, WaveSize);
17594 }
17595 if (Result)
17596 break;
17597 }
17598 return Result;
17599 }
17600
requiresUniformRegister(MachineFunction & MF,const Value * V) const17601 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
17602 const Value *V) const {
17603 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
17604 if (CI->isInlineAsm()) {
17605 // FIXME: This cannot give a correct answer. This should only trigger in
17606 // the case where inline asm returns mixed SGPR and VGPR results, used
17607 // outside the defining block. We don't have a specific result to
17608 // consider, so this assumes if any value is SGPR, the overall register
17609 // also needs to be SGPR.
17610 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
17611 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
17612 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
17613 for (auto &TC : TargetConstraints) {
17614 if (TC.Type == InlineAsm::isOutput) {
17615 ComputeConstraintToUse(TC, SDValue());
17616 const TargetRegisterClass *RC =
17617 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
17618 TC.ConstraintVT)
17619 .second;
17620 if (RC && SIRI->isSGPRClass(RC))
17621 return true;
17622 }
17623 }
17624 }
17625 }
17626 SmallPtrSet<const Value *, 16> Visited;
17627 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
17628 }
17629
hasMemSDNodeUser(SDNode * N) const17630 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
17631 for (SDUse &Use : N->uses()) {
17632 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
17633 if (getBasePtrIndex(M) == Use.getOperandNo())
17634 return true;
17635 }
17636 }
17637 return false;
17638 }
17639
isReassocProfitable(SelectionDAG & DAG,SDValue N0,SDValue N1) const17640 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
17641 SDValue N1) const {
17642 if (!N0.hasOneUse())
17643 return false;
17644 // Take care of the opportunity to keep N0 uniform
17645 if (N0->isDivergent() || !N1->isDivergent())
17646 return true;
17647 // Check if we have a good chance to form the memory access pattern with the
17648 // base and offset
17649 return (DAG.isBaseWithConstantOffset(N0) &&
17650 hasMemSDNodeUser(*N0->user_begin()));
17651 }
17652
isReassocProfitable(MachineRegisterInfo & MRI,Register N0,Register N1) const17653 bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
17654 Register N0, Register N1) const {
17655 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
17656 }
17657
17658 MachineMemOperand::Flags
getTargetMMOFlags(const Instruction & I) const17659 SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
17660 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
17661 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
17662 if (I.getMetadata("amdgpu.noclobber"))
17663 Flags |= MONoClobber;
17664 if (I.getMetadata("amdgpu.last.use"))
17665 Flags |= MOLastUse;
17666 return Flags;
17667 }
17668
checkForPhysRegDependency(SDNode * Def,SDNode * User,unsigned Op,const TargetRegisterInfo * TRI,const TargetInstrInfo * TII,MCRegister & PhysReg,int & Cost) const17669 bool SITargetLowering::checkForPhysRegDependency(
17670 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
17671 const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const {
17672 if (User->getOpcode() != ISD::CopyToReg)
17673 return false;
17674 if (!Def->isMachineOpcode())
17675 return false;
17676 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
17677 if (!MDef)
17678 return false;
17679
17680 unsigned ResNo = User->getOperand(Op).getResNo();
17681 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
17682 return false;
17683 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
17684 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
17685 PhysReg = AMDGPU::SCC;
17686 const TargetRegisterClass *RC =
17687 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
17688 Cost = RC->getCopyCost();
17689 return true;
17690 }
17691 return false;
17692 }
17693
/// Expand the flat atomic \p AI (an atomicrmw or cmpxchg) into control flow
/// that tests the runtime address space of the pointer and handles the
/// private case with plain, non-atomic memory operations.
///
/// The original instruction is moved into the "atomicrmw.global" block and
/// reused there. When its result is used, all uses are redirected to a phi
/// in "atomicrmw.phi" that merges the per-address-space results.
void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
    Instruction *AI) const {
  // Given: atomicrmw fadd ptr %addr, float %val ordering
  //
  // With this expansion we produce the following code:
  //   [...]
  //   %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
  //   br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
  //
  // atomicrmw.shared:
  //   %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
  //   %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
  //                                   float %val ordering
  //   br label %atomicrmw.phi
  //
  // atomicrmw.check.private:
  //   %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
  //   br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
  //
  // atomicrmw.private:
  //   %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
  //   %loaded.private = load float, ptr addrspace(5) %cast.private
  //   %val.new = fadd float %loaded.private, %val
  //   store float %val.new, ptr addrspace(5) %cast.private
  //   br label %atomicrmw.phi
  //
  // atomicrmw.global:
  //   %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
  //   %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
  //                                   float %val ordering
  //   br label %atomicrmw.phi
  //
  // atomicrmw.phi:
  //   %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
  //                           [ %loaded.private, %atomicrmw.private ],
  //                           [ %loaded.global, %atomicrmw.global ]
  //   br label %atomicrmw.end
  //
  // atomicrmw.end:
  //    [...]
  //
  //
  // For 64-bit atomics which may reside in private memory, we perform a simpler
  // version that only inserts the private check, and uses the flat operation.

  IRBuilder<> Builder(AI);
  LLVMContext &Ctx = Builder.getContext();

  // AI is either an atomicrmw or a cmpxchg; the pointer operand index is
  // taken from whichever instruction class applies.
  auto *RMW = dyn_cast<AtomicRMWInst>(AI);
  const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
                                : AtomicCmpXchgInst::getPointerOperandIndex();
  Value *Addr = AI->getOperand(PtrOpIdx);

  /// TODO: Only need to check private, then emit flat-known-not private (no
  /// need for shared block, or cast to global).
  AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);

  // NOTE(review): Alignment is assigned here but not read again in this
  // function; the code below calls getAlign() on RMW/CX directly. This chain
  // still rejects any instruction that is neither an atomicrmw nor a cmpxchg.
  Align Alignment;
  if (RMW)
    Alignment = RMW->getAlign();
  else if (CX)
    Alignment = CX->getAlign();
  else
    llvm_unreachable("unhandled atomic operation");

  // FullFlatEmulation is true if we need to issue the private, shared, and
  // global cases.
  //
  // If this is false, we are only dealing with the flat-targeting-private case,
  // where we only insert a check for private and still use the flat instruction
  // for global and shared.

  bool FullFlatEmulation =
      RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
      ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
       (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
        RMW->getType()->isDoubleTy()));

  // If the return value isn't used, do not introduce a false use in the phi.
  bool ReturnValueIsUsed = !AI->use_empty();

  // Split at the atomic so that everything from it onwards lands in ExitBB;
  // the new blocks are all created in front of ExitBB.
  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();
  BasicBlock *ExitBB =
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
  BasicBlock *SharedBB = nullptr;

  // Without full emulation the private check is emitted directly into BB.
  BasicBlock *CheckPrivateBB = BB;
  if (FullFlatEmulation) {
    SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
    CheckPrivateBB =
        BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
  }

  BasicBlock *PrivateBB =
      BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
  BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
  BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);

  // Remove the last instruction of BB (the branch left behind by the split)
  // so the new conditional control flow can be appended instead.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);

  Value *LoadedShared = nullptr;
  if (FullFlatEmulation) {
    CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
                                                 {Addr}, nullptr, "is.shared");
    Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
    Builder.SetInsertPoint(SharedBB);
    Value *CastToLocal = Builder.CreateAddrSpaceCast(
        Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));

    // The shared path runs a clone of the atomic on the LDS-cast pointer.
    Instruction *Clone = AI->clone();
    Clone->insertInto(SharedBB, SharedBB->end());
    Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
    LoadedShared = Clone;

    Builder.CreateBr(PhiBB);
    Builder.SetInsertPoint(CheckPrivateBB);
  }

  CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
                                                {Addr}, nullptr, "is.private");
  Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);

  Builder.SetInsertPoint(PrivateBB);

  Value *CastToPrivate = Builder.CreateAddrSpaceCast(
      Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));

  // The private path is emitted as ordinary load / compute / store.
  Value *LoadedPrivate;
  if (RMW) {
    LoadedPrivate = Builder.CreateAlignedLoad(
        RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");

    Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
                                        LoadedPrivate, RMW->getValOperand());

    Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
  } else {
    auto [ResultLoad, Equal] =
        buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
                          CX->getNewValOperand(), CX->getAlign());

    // Rebuild the aggregate result of the cmpxchg from the loaded value
    // (element 0) and the comparison flag (element 1).
    Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
                                              ResultLoad, 0);
    LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
  }

  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(GlobalBB);

  // Continue using a flat instruction if we only emitted the check for private.
  Instruction *LoadedGlobal = AI;
  if (FullFlatEmulation) {
    Value *CastToGlobal = Builder.CreateAddrSpaceCast(
        Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
    AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
  }

  // Relocate the original atomic to the end of the global block.
  AI->removeFromParent();
  AI->insertInto(GlobalBB, GlobalBB->end());

  // The new atomicrmw may go through another round of legalization later.
  if (!FullFlatEmulation) {
    // We inserted the runtime check already, make sure we do not try to
    // re-expand this.
    // TODO: Should union with any existing metadata.
    MDBuilder MDB(F->getContext());
    MDNode *RangeNotPrivate =
        MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
                        APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
    LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
                              RangeNotPrivate);
  }

  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(PhiBB);

  if (ReturnValueIsUsed) {
    PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
    // Redirect the uses before AI itself is added as an incoming value of
    // the phi, so that incoming edge is not rewritten to the phi.
    AI->replaceAllUsesWith(Loaded);
    if (FullFlatEmulation)
      Loaded->addIncoming(LoadedShared, SharedBB);
    Loaded->addIncoming(LoadedPrivate, PrivateBB);
    Loaded->addIncoming(LoadedGlobal, GlobalBB);
    Loaded->takeName(AI);
  }

  Builder.CreateBr(ExitBB);
}
17886
emitExpandAtomicRMW(AtomicRMWInst * AI) const17887 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
17888 AtomicRMWInst::BinOp Op = AI->getOperation();
17889
17890 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
17891 Op == AtomicRMWInst::Xor) {
17892 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
17893 ConstVal && ConstVal->isNullValue()) {
17894 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
17895 AI->setOperation(AtomicRMWInst::Add);
17896
17897 // We may still need the private-alias-flat handling below.
17898
17899 // TODO: Skip this for cases where we cannot access remote memory.
17900 }
17901 }
17902
17903 // The non-flat expansions should only perform the de-canonicalization of
17904 // identity values.
17905 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
17906 return;
17907
17908 emitExpandAtomicAddrSpacePredicate(AI);
17909 }
17910
emitExpandAtomicCmpXchg(AtomicCmpXchgInst * CI) const17911 void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
17912 emitExpandAtomicAddrSpacePredicate(CI);
17913 }
17914
17915 LoadInst *
lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst * AI) const17916 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
17917 IRBuilder<> Builder(AI);
17918 auto Order = AI->getOrdering();
17919
17920 // The optimization removes store aspect of the atomicrmw. Therefore, cache
17921 // must be flushed if the atomic ordering had a release semantics. This is
17922 // not necessary a fence, a release fence just coincides to do that flush.
17923 // Avoid replacing of an atomicrmw with a release semantics.
17924 if (isReleaseOrStronger(Order))
17925 return nullptr;
17926
17927 LoadInst *LI = Builder.CreateAlignedLoad(
17928 AI->getType(), AI->getPointerOperand(), AI->getAlign());
17929 LI->setAtomic(Order, AI->getSyncScopeID());
17930 LI->copyMetadata(*AI);
17931 LI->takeName(AI);
17932 AI->replaceAllUsesWith(LI);
17933 AI->eraseFromParent();
17934 return LI;
17935 }
17936