1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This is the parent TargetLowering class for hardware code gen
11 /// targets.
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "AMDGPUISelLowering.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUMachineFunction.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/CodeGen/Analysis.h"
21 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
22 #include "llvm/CodeGen/MachineFrameInfo.h"
23 #include "llvm/IR/DiagnosticInfo.h"
24 #include "llvm/IR/IntrinsicsAMDGPU.h"
25 #include "llvm/IR/PatternMatch.h"
26 #include "llvm/Support/CommandLine.h"
27 #include "llvm/Support/KnownBits.h"
28 #include "llvm/Target/TargetMachine.h"
29
30 using namespace llvm;
31
32 #include "AMDGPUGenCallingConv.inc"
33
34 static cl::opt<bool> AMDGPUBypassSlowDiv(
35 "amdgpu-bypass-slow-div",
36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37 cl::init(true));
38
39 // Find a larger type to do a load / store of a vector with.
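// Illustrative mapping (follows directly from the logic below): a 16-bit
// vector such as v2i8 becomes i16, a 64-bit vector such as v4i16 becomes
// v2i32, and an oddly sized type such as v3i16 (48 bits) is returned
// unchanged.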
40 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
41 unsigned StoreSize = VT.getStoreSizeInBits();
42 if (StoreSize <= 32)
43 return EVT::getIntegerVT(Ctx, StoreSize);
44
45 if (StoreSize % 32 == 0)
46 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
47
48 return VT;
49 }
50
51 unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
52 return DAG.computeKnownBits(Op).countMaxActiveBits();
53 }
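// Illustrative: if Op is known to be (and X, 0xFFFF), computeKnownBits proves
// the top 16 bits are zero, so numBitsUnsigned returns 16; callers use this,
// for example, to decide whether a value fits in a 24-bit multiply.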
54
55 unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
56 // In order for this to be a signed 24-bit value, bit 23 must
57 // be a sign bit.
58 return DAG.ComputeMaxSignificantBits(Op);
59 }
60
61 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
62 const AMDGPUSubtarget &STI)
63 : TargetLowering(TM), Subtarget(&STI) {
64 // Always lower memset, memcpy, and memmove intrinsics to load/store
65 // instructions, rather than generating calls to memset, memcpy, or memmove.
66 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
67 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
68 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;
69
70 // Enable ganging up loads and stores in the memcpy DAG lowering.
71 MaxGluedStoresPerMemcpy = 16;
72
73 // Lower floating point store/load to integer store/load to reduce the number
74 // of patterns in tablegen.
75 setOperationAction(ISD::LOAD, MVT::f32, Promote);
76 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
77
78 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
79 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
80
81 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
82 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
83
84 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
85 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
86
87 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
88 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
89
90 setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
91 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
92
93 setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
94 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
95
96 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
97 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
98
99 setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
100 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
101
102 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
103 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
104
105 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
106 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
107
108 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
109 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
110
111 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
112 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
113
114 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
115 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
116
117 setOperationAction(ISD::LOAD, MVT::i64, Promote);
118 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
119
120 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
121 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
122
123 setOperationAction(ISD::LOAD, MVT::f64, Promote);
124 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
125
126 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
127 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
128
129 setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
130 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
131
132 setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
133 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
134
135 setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
136 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
137
138 setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
139 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
140
141 setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
142 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
143
144 setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
145 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
146
147 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
148 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
149
150 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
151 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
152
153 setOperationAction(ISD::LOAD, MVT::i128, Promote);
154 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
155
156 // TODO: Would be better to consume as directly legal
157 setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
158 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
159
160 setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
161 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
162
163 setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
164 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
165
166 setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
167 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
168
169 setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote);
170 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
171
172 setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote);
173 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
174
175 setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote);
176 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
177
178 setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote);
179 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
180
181 // There are no 64-bit extloads. These should be done as a 32-bit extload and
182 // an extension to 64-bit.
183 for (MVT VT : MVT::integer_valuetypes())
184 setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
185 Expand);
186
187 for (MVT VT : MVT::integer_valuetypes()) {
188 if (VT == MVT::i64)
189 continue;
190
191 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
192 setLoadExtAction(Op, VT, MVT::i1, Promote);
193 setLoadExtAction(Op, VT, MVT::i8, Legal);
194 setLoadExtAction(Op, VT, MVT::i16, Legal);
195 setLoadExtAction(Op, VT, MVT::i32, Expand);
196 }
197 }
198
199 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
200 for (auto MemVT :
201 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
202 setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
203 Expand);
204
205 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
206 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
207 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
219
220 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
221 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
222 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
226
227 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
228 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
229 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
239
240 setOperationAction(ISD::STORE, MVT::f32, Promote);
241 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
242
243 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
244 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
245
246 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
247 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
248
249 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
250 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
251
252 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
253 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
254
255 setOperationAction(ISD::STORE, MVT::v6f32, Promote);
256 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
257
258 setOperationAction(ISD::STORE, MVT::v7f32, Promote);
259 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
260
261 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
262 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
263
264 setOperationAction(ISD::STORE, MVT::v9f32, Promote);
265 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
266
267 setOperationAction(ISD::STORE, MVT::v10f32, Promote);
268 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
269
270 setOperationAction(ISD::STORE, MVT::v11f32, Promote);
271 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
272
273 setOperationAction(ISD::STORE, MVT::v12f32, Promote);
274 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
275
276 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
277 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
278
279 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
280 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
281
282 setOperationAction(ISD::STORE, MVT::i64, Promote);
283 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
284
285 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
286 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
287
288 setOperationAction(ISD::STORE, MVT::f64, Promote);
289 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
290
291 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
292 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
293
294 setOperationAction(ISD::STORE, MVT::v3i64, Promote);
295 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
296
297 setOperationAction(ISD::STORE, MVT::v3f64, Promote);
298 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
299
300 setOperationAction(ISD::STORE, MVT::v4i64, Promote);
301 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
302
303 setOperationAction(ISD::STORE, MVT::v4f64, Promote);
304 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
305
306 setOperationAction(ISD::STORE, MVT::v8i64, Promote);
307 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
308
309 setOperationAction(ISD::STORE, MVT::v8f64, Promote);
310 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
311
312 setOperationAction(ISD::STORE, MVT::v16i64, Promote);
313 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
314
315 setOperationAction(ISD::STORE, MVT::v16f64, Promote);
316 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
317
318 setOperationAction(ISD::STORE, MVT::i128, Promote);
319 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
320
321 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
322 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
323 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
324 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
325
326 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
327 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
328 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
329 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
330
331 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
332 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
333 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
334 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
335 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
336 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
337 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
338 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
339 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
340 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
341 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
342 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
343 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
344 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
345
346 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
347 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
348 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
349
350 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
351 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
352 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
353
354 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
355
356 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
357 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
358 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
359 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
360 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
361 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
362 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
363
364 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
365 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
366 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
367 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
368 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
369
370 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
371 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
372 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
373
374 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
375 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
376 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
377 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
379 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
381 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
382
383 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
384 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
385
386 setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);
387
388 // For R600, this is totally unsupported; just custom lower it to produce an
389 // error.
390 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
391
392 // Library functions. These default to Expand, but we have instructions
393 // for them.
394 setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
395 ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
396 MVT::f32, Legal);
397
398 setOperationAction(ISD::FLOG2, MVT::f32, Custom);
399 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
400
401 setOperationAction(
402 {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
403 Custom);
404
405 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
406
407 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
408
409 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
410
411 if (Subtarget->has16BitInsts())
412 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
413 else {
414 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
415 setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
416 }
417
418 setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
419 Custom);
420
421 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
422 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
423 // default unless marked custom/legal.
424 setOperationAction(
425 ISD::IS_FPCLASS,
426 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
427 MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
428 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
429 Custom);
430
431 // Expand to fneg + fadd.
432 setOperationAction(ISD::FSUB, MVT::f64, Expand);
433
434 setOperationAction(ISD::CONCAT_VECTORS,
435 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
436 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
437 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
438 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
439 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
440 Custom);
441
442 // FIXME: Why is v8f16/v8bf16 missing?
443 setOperationAction(
444 ISD::EXTRACT_SUBVECTOR,
445 {MVT::v2f16, MVT::v2bf16, MVT::v2i16, MVT::v4f16, MVT::v4bf16,
446 MVT::v4i16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32,
447 MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32,
448 MVT::v6i32, MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32,
449 MVT::v9f32, MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32,
450 MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16,
451 MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
452 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
453 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64,
454 MVT::v32i16, MVT::v32f16, MVT::v32bf16},
455 Custom);
456
457 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
458 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
459
460 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
461 for (MVT VT : ScalarIntVTs) {
462 // These should use [SU]DIVREM, so set them to expand
463 setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
464 Expand);
465
466 // GPU does not have divrem function for signed or unsigned.
467 setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);
468
469 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
470 setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);
471
472 setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);
473
474 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
475 setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
476 }
477
478 // The hardware supports 32-bit FSHR, but not FSHL.
479 setOperationAction(ISD::FSHR, MVT::i32, Legal);
480
481 // The hardware supports 32-bit ROTR, but not ROTL.
482 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
483 setOperationAction(ISD::ROTR, MVT::i64, Expand);
484
485 setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);
486
487 setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
488 setOperationAction(
489 {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
490 MVT::i64, Custom);
491 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
492
493 setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
494 Legal);
495
496 setOperationAction(
497 {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
498 MVT::i64, Custom);
499
500 for (auto VT : {MVT::i8, MVT::i16})
501 setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom);
502
503 static const MVT::SimpleValueType VectorIntTypes[] = {
504 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
505 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
506
507 for (MVT VT : VectorIntTypes) {
508 // Expand the following operations for the current type by default.
509 setOperationAction({ISD::ADD, ISD::AND, ISD::FP_TO_SINT,
510 ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU,
511 ISD::MULHS, ISD::OR, ISD::SHL,
512 ISD::SRA, ISD::SRL, ISD::ROTL,
513 ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP,
514 ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV,
515 ISD::SREM, ISD::UREM, ISD::SMUL_LOHI,
516 ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM,
517 ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC,
518 ISD::XOR, ISD::BSWAP, ISD::CTPOP,
519 ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE,
520 ISD::SETCC},
521 VT, Expand);
522 }
523
524 static const MVT::SimpleValueType FloatVectorTypes[] = {
525 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
526 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
527
528 for (MVT VT : FloatVectorTypes) {
529 setOperationAction(
530 {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
531 ISD::FADD, ISD::FCEIL, ISD::FCOS,
532 ISD::FDIV, ISD::FEXP2, ISD::FEXP,
533 ISD::FEXP10, ISD::FLOG2, ISD::FREM,
534 ISD::FLOG, ISD::FLOG10, ISD::FPOW,
535 ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
536 ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
537 ISD::FSQRT, ISD::FSIN, ISD::FSUB,
538 ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
539 ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC,
540 ISD::FCANONICALIZE, ISD::FROUNDEVEN},
541 VT, Expand);
542 }
543
544 // This causes an unrolled select operation to be used rather than expansion
545 // with bit operations. This is in general better, but the alternative of using
546 // BFI instructions may be better if the select sources are SGPRs.
547 setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
548 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
549
550 setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
551 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
552
553 setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
554 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
555
556 setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
557 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
558
559 setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
560 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
561
562 setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
563 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
564
565 setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
566 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
567
568 setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
569 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
570
571 setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
572 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
573
574 setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
575 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
576
577 setSchedulingPreference(Sched::RegPressure);
578 setJumpIsExpensive(true);
579
580 // FIXME: This is only partially true. If we have to do vector compares, any
581 // SGPR pair can be a condition register. If we have a uniform condition, we
582 // are better off doing SALU operations, where there is only one SCC. For now,
583 // we don't have a way of knowing during instruction selection if a condition
584 // will be uniform and we always use vector compares. Assume we are using
585 // vector compares until that is fixed.
586 setHasMultipleConditionRegisters(true);
587
588 setMinCmpXchgSizeInBits(32);
589 setSupportsUnalignedAtomics(false);
590
591 PredictableSelectIsExpensive = false;
592
593 // We want to find all load dependencies for long chains of stores to enable
594 // merging into very wide vectors. The problem is with vectors with > 4
595 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
596 // vectors are a legal type, even though we have to split the loads
597 // usually. When we can more precisely specify load legality per address
598 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
599 // smarter so that they can figure out what to do in 2 iterations without all
600 // N > 4 stores on the same chain.
601 GatherAllAliasesMaxDepth = 16;
602
603 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
604 // about these during lowering.
605 MaxStoresPerMemcpy = 0xffffffff;
606 MaxStoresPerMemmove = 0xffffffff;
607 MaxStoresPerMemset = 0xffffffff;
608
609 // The expansion for 64-bit division is enormous.
610 if (AMDGPUBypassSlowDiv)
611 addBypassSlowDiv(64, 32);
612
613 setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
614 ISD::SRA, ISD::SRL,
615 ISD::TRUNCATE, ISD::MUL,
616 ISD::SMUL_LOHI, ISD::UMUL_LOHI,
617 ISD::MULHU, ISD::MULHS,
618 ISD::SELECT, ISD::SELECT_CC,
619 ISD::STORE, ISD::FADD,
620 ISD::FSUB, ISD::FNEG,
621 ISD::FABS, ISD::AssertZext,
622 ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
623
624 setMaxAtomicSizeInBitsSupported(64);
625 setMaxDivRemBitWidthSupported(64);
626 setMaxLargeFPConvertBitWidthSupported(64);
627 }
628
629 bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
630 if (getTargetMachine().Options.NoSignedZerosFPMath)
631 return true;
632
633 const auto Flags = Op.getNode()->getFlags();
634 if (Flags.hasNoSignedZeros())
635 return true;
636
637 return false;
638 }
639
640 //===----------------------------------------------------------------------===//
641 // Target Information
642 //===----------------------------------------------------------------------===//
643
644 LLVM_READNONE
645 static bool fnegFoldsIntoOpcode(unsigned Opc) {
646 switch (Opc) {
647 case ISD::FADD:
648 case ISD::FSUB:
649 case ISD::FMUL:
650 case ISD::FMA:
651 case ISD::FMAD:
652 case ISD::FMINNUM:
653 case ISD::FMAXNUM:
654 case ISD::FMINNUM_IEEE:
655 case ISD::FMAXNUM_IEEE:
656 case ISD::FMINIMUM:
657 case ISD::FMAXIMUM:
658 case ISD::SELECT:
659 case ISD::FSIN:
660 case ISD::FTRUNC:
661 case ISD::FRINT:
662 case ISD::FNEARBYINT:
663 case ISD::FROUNDEVEN:
664 case ISD::FCANONICALIZE:
665 case AMDGPUISD::RCP:
666 case AMDGPUISD::RCP_LEGACY:
667 case AMDGPUISD::RCP_IFLAG:
668 case AMDGPUISD::SIN_HW:
669 case AMDGPUISD::FMUL_LEGACY:
670 case AMDGPUISD::FMIN_LEGACY:
671 case AMDGPUISD::FMAX_LEGACY:
672 case AMDGPUISD::FMED3:
673 // TODO: handle llvm.amdgcn.fma.legacy
674 return true;
675 case ISD::BITCAST:
676 llvm_unreachable("bitcast is special cased");
677 default:
678 return false;
679 }
680 }
681
682 static bool fnegFoldsIntoOp(const SDNode *N) {
683 unsigned Opc = N->getOpcode();
684 if (Opc == ISD::BITCAST) {
685 // TODO: Is there a benefit to checking the conditions performFNegCombine
686 // does? We don't for the other cases.
687 SDValue BCSrc = N->getOperand(0);
688 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
689 return BCSrc.getNumOperands() == 2 &&
690 BCSrc.getOperand(1).getValueSizeInBits() == 32;
691 }
692
693 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
694 }
695
696 return fnegFoldsIntoOpcode(Opc);
697 }
698
699 /// \returns true if the operation will definitely need to use a 64-bit
700 /// encoding, and thus will use a VOP3 encoding regardless of the source
701 /// modifiers.
702 LLVM_READONLY
703 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
704 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
705 VT == MVT::f64;
706 }
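// Illustrative: a 3-operand node such as FMA, or any f64 operation, already
// requires the 64-bit VOP3 encoding, so source modifiers on it are free; a
// 2-operand f32 add can use a 32-bit encoding, where adding a modifier would
// typically force the larger VOP3 form.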
707
708 /// Return true if v_cndmask_b32 will support fabs/fneg source modifiers
709 /// for the node's type when used for ISD::SELECT.
710 LLVM_READONLY
711 static bool selectSupportsSourceMods(const SDNode *N) {
712 // TODO: Only applies if select will be vector
713 return N->getValueType(0) == MVT::f32;
714 }
715
716 // Most FP instructions support source modifiers, but this could be refined
717 // slightly.
718 LLVM_READONLY
719 static bool hasSourceMods(const SDNode *N) {
720 if (isa<MemSDNode>(N))
721 return false;
722
723 switch (N->getOpcode()) {
724 case ISD::CopyToReg:
725 case ISD::FDIV:
726 case ISD::FREM:
727 case ISD::INLINEASM:
728 case ISD::INLINEASM_BR:
729 case AMDGPUISD::DIV_SCALE:
730 case ISD::INTRINSIC_W_CHAIN:
731
732 // TODO: Should really be looking at the users of the bitcast. These are
733 // problematic because bitcasts are used to legalize all stores to integer
734 // types.
735 case ISD::BITCAST:
736 return false;
737 case ISD::INTRINSIC_WO_CHAIN: {
738 switch (N->getConstantOperandVal(0)) {
739 case Intrinsic::amdgcn_interp_p1:
740 case Intrinsic::amdgcn_interp_p2:
741 case Intrinsic::amdgcn_interp_mov:
742 case Intrinsic::amdgcn_interp_p1_f16:
743 case Intrinsic::amdgcn_interp_p2_f16:
744 return false;
745 default:
746 return true;
747 }
748 }
749 case ISD::SELECT:
750 return selectSupportsSourceMods(N);
751 default:
752 return true;
753 }
754 }
755
756 bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
757 unsigned CostThreshold) {
758 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
759 // it is truly free to use a source modifier in all cases. If there are
760 // multiple users, and using a source modifier for each would force a VOP3
761 // encoding, there will be a code size increase. Try to avoid increasing code
762 // size unless we know it will save on the instruction count.
763 unsigned NumMayIncreaseSize = 0;
764 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
765
766 assert(!N->use_empty());
767
768 // XXX - Should this limit number of uses to check?
769 for (const SDNode *U : N->uses()) {
770 if (!hasSourceMods(U))
771 return false;
772
773 if (!opMustUseVOP3Encoding(U, VT)) {
774 if (++NumMayIncreaseSize > CostThreshold)
775 return false;
776 }
777 }
778
779 return true;
780 }
781
782 EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
783 ISD::NodeType ExtendKind) const {
784 assert(!VT.isVector() && "only scalar expected");
785
786 // Round up to the next multiple of 32 bits.
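// Illustrative results: i1/i8/i16 -> i32, i48 -> i64, and i96 is already a
// multiple of 32 so it is returned unchanged.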
787 unsigned Size = VT.getSizeInBits();
788 if (Size <= 32)
789 return MVT::i32;
790 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
791 }
792
793 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
794 return MVT::i32;
795 }
796
797 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
798 return true;
799 }
800
801 // The backend supports 32 and 64 bit floating point immediates.
802 // FIXME: Why are we reporting vectors of FP immediates as legal?
803 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
804 bool ForCodeSize) const {
805 EVT ScalarVT = VT.getScalarType();
806 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
807 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
808 }
809
810 // We don't want to shrink f64 / f32 constants.
811 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
812 EVT ScalarVT = VT.getScalarType();
813 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
814 }
815
816 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
817 ISD::LoadExtType ExtTy,
818 EVT NewVT) const {
819 // TODO: This may be worth removing. Check regression tests for diffs.
820 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
821 return false;
822
823 unsigned NewSize = NewVT.getStoreSizeInBits();
824
825 // If we are reducing to a 32-bit load or a smaller multi-dword load,
826 // this is always better.
827 if (NewSize >= 32)
828 return true;
829
830 EVT OldVT = N->getValueType(0);
831 unsigned OldSize = OldVT.getStoreSizeInBits();
832
833 MemSDNode *MN = cast<MemSDNode>(N);
834 unsigned AS = MN->getAddressSpace();
835 // Do not shrink an aligned scalar load to sub-dword.
836 // Scalar engine cannot do sub-dword loads.
837 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
838 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
839 (AS == AMDGPUAS::CONSTANT_ADDRESS ||
840 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
841 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
842 MN->isInvariant())) &&
843 AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
844 return false;
845
846 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
847 // extloads, so doing one requires using a buffer_load. In cases where we
848 // still couldn't use a scalar load, using the wider load shouldn't really
849 // hurt anything.
850
851 // If the old size already had to be an extload, there's no harm in continuing
852 // to reduce the width.
853 return (OldSize < 32);
854 }
855
856 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
857 const SelectionDAG &DAG,
858 const MachineMemOperand &MMO) const {
859
860 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
861
862 if (LoadTy.getScalarType() == MVT::i32)
863 return false;
864
865 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
866 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
867
868 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
869 return false;
870
871 unsigned Fast = 0;
872 return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
873 CastTy, MMO, &Fast) &&
874 Fast;
875 }
876
877 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
878 // profitable with the expansion for 64-bit since it's generally good to
879 // speculate things.
880 bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
881 return true;
882 }
883
884 bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
885 return true;
886 }
887
888 bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
889 switch (N->getOpcode()) {
890 case ISD::EntryToken:
891 case ISD::TokenFactor:
892 return true;
893 case ISD::INTRINSIC_WO_CHAIN: {
894 unsigned IntrID = N->getConstantOperandVal(0);
895 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
896 }
897 case ISD::LOAD:
898 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
899 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
900 return true;
901 return false;
902 case AMDGPUISD::SETCC: // ballot-style instruction
903 return true;
904 }
905 return false;
906 }
907
908 SDValue AMDGPUTargetLowering::getNegatedExpression(
909 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
910 NegatibleCost &Cost, unsigned Depth) const {
911
912 switch (Op.getOpcode()) {
913 case ISD::FMA:
914 case ISD::FMAD: {
915 // Negating a fma is not free if it has users without source mods.
916 if (!allUsesHaveSourceMods(Op.getNode()))
917 return SDValue();
918 break;
919 }
920 case AMDGPUISD::RCP: {
921 SDValue Src = Op.getOperand(0);
922 EVT VT = Op.getValueType();
923 SDLoc SL(Op);
924
925 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
926 ForCodeSize, Cost, Depth + 1);
927 if (NegSrc)
928 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
929 return SDValue();
930 }
931 default:
932 break;
933 }
934
935 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
936 ForCodeSize, Cost, Depth);
937 }
938
939 //===---------------------------------------------------------------------===//
940 // Target Properties
941 //===---------------------------------------------------------------------===//
942
943 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
944 assert(VT.isFloatingPoint());
945
946 // Packed operations do not have a fabs modifier.
947 return VT == MVT::f32 || VT == MVT::f64 ||
948 (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
949 }
950
951 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
952 assert(VT.isFloatingPoint());
953 // Report this based on the end legalized type.
954 VT = VT.getScalarType();
955 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
956 }
957
958 bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
959 unsigned NumElem,
960 unsigned AS) const {
961 return true;
962 }
963
964 bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
965 // There are few operations which truly have vector input operands. Any vector
966 // operation is going to involve operations on each component, and a
967 // build_vector will be a copy per element, so it always makes sense to use a
968 // build_vector input in place of the extracted element to avoid a copy into a
969 // super register.
970 //
971 // We should probably only do this if all users are extracts only, but this
972 // should be the common case.
973 return true;
974 }
975
976 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
977 // Truncate is just accessing a subregister.
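// Illustrative: i64 -> i32 (or v2i64 -> v2i32) just reads the low 32-bit
// subregister(s) and is free; i32 -> i16 is not, since the destination size
// is not a multiple of 32 bits.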
978
979 unsigned SrcSize = Source.getSizeInBits();
980 unsigned DestSize = Dest.getSizeInBits();
981
982 return DestSize < SrcSize && DestSize % 32 == 0;
983 }
984
985 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
986 // Truncate is just accessing a subregister.
987
988 unsigned SrcSize = Source->getScalarSizeInBits();
989 unsigned DestSize = Dest->getScalarSizeInBits();
990
991 if (DestSize == 16 && Subtarget->has16BitInsts())
992 return SrcSize >= 32;
993
994 return DestSize < SrcSize && DestSize % 32 == 0;
995 }
996
997 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
998 unsigned SrcSize = Src->getScalarSizeInBits();
999 unsigned DestSize = Dest->getScalarSizeInBits();
1000
1001 if (SrcSize == 16 && Subtarget->has16BitInsts())
1002 return DestSize >= 32;
1003
1004 return SrcSize == 32 && DestSize == 64;
1005 }
1006
1007 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
1008 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1009 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1010 // this will enable reducing 64-bit operations to 32-bit, which is always
1011 // good.
1012
1013 if (Src == MVT::i16)
1014 return Dest == MVT::i32 || Dest == MVT::i64;
1015
1016 return Src == MVT::i32 && Dest == MVT::i64;
1017 }
1018
1019 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
1020 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1021 // limited number of native 64-bit operations. Shrinking an operation to fit
1022 // in a single 32-bit register should always be helpful. As currently used,
1023 // this is much less general than the name suggests, and is only used in
1024 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1025 // not profitable, and may actually be harmful.
1026 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1027 }
1028
1029 bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
1030 const SDNode* N, CombineLevel Level) const {
1031 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1032 N->getOpcode() == ISD::SRL) &&
1033 "Expected shift op");
1034 // Always commute pre-type legalization and right shifts.
1035 // We're looking for shl(or(x,y),z) patterns.
1036 if (Level < CombineLevel::AfterLegalizeTypes ||
1037 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1038 return true;
1039
1040 // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
1041 if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
1042 (N->use_begin()->getOpcode() == ISD::SRA ||
1043 N->use_begin()->getOpcode() == ISD::SRL))
1044 return false;
1045
1046 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
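// Illustrative shape of the guarded pattern: (zextload i16 %p << 16) |
// (zextload i16 %q), where the shift amount equals the zext-load's memory
// width; the lambda below checks exactly these conditions.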
1047 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1048 if (LHS.getOpcode() != ISD::SHL)
1049 return false;
1050 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1051 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1052 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1053 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1054 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1055 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1056 };
1057 SDValue LHS = N->getOperand(0).getOperand(0);
1058 SDValue RHS = N->getOperand(0).getOperand(1);
1059 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1060 }
1061
1062 //===---------------------------------------------------------------------===//
1063 // TargetLowering Callbacks
1064 //===---------------------------------------------------------------------===//
1065
1066 CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
1067 bool IsVarArg) {
1068 switch (CC) {
1069 case CallingConv::AMDGPU_VS:
1070 case CallingConv::AMDGPU_GS:
1071 case CallingConv::AMDGPU_PS:
1072 case CallingConv::AMDGPU_CS:
1073 case CallingConv::AMDGPU_HS:
1074 case CallingConv::AMDGPU_ES:
1075 case CallingConv::AMDGPU_LS:
1076 return CC_AMDGPU;
1077 case CallingConv::AMDGPU_CS_Chain:
1078 case CallingConv::AMDGPU_CS_ChainPreserve:
1079 return CC_AMDGPU_CS_CHAIN;
1080 case CallingConv::C:
1081 case CallingConv::Fast:
1082 case CallingConv::Cold:
1083 return CC_AMDGPU_Func;
1084 case CallingConv::AMDGPU_Gfx:
1085 return CC_SI_Gfx;
1086 case CallingConv::AMDGPU_KERNEL:
1087 case CallingConv::SPIR_KERNEL:
1088 default:
1089 report_fatal_error("Unsupported calling convention for call");
1090 }
1091 }
1092
1093 CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
1094 bool IsVarArg) {
1095 switch (CC) {
1096 case CallingConv::AMDGPU_KERNEL:
1097 case CallingConv::SPIR_KERNEL:
1098 llvm_unreachable("kernels should not be handled here");
1099 case CallingConv::AMDGPU_VS:
1100 case CallingConv::AMDGPU_GS:
1101 case CallingConv::AMDGPU_PS:
1102 case CallingConv::AMDGPU_CS:
1103 case CallingConv::AMDGPU_CS_Chain:
1104 case CallingConv::AMDGPU_CS_ChainPreserve:
1105 case CallingConv::AMDGPU_HS:
1106 case CallingConv::AMDGPU_ES:
1107 case CallingConv::AMDGPU_LS:
1108 return RetCC_SI_Shader;
1109 case CallingConv::AMDGPU_Gfx:
1110 return RetCC_SI_Gfx;
1111 case CallingConv::C:
1112 case CallingConv::Fast:
1113 case CallingConv::Cold:
1114 return RetCC_AMDGPU_Func;
1115 default:
1116 report_fatal_error("Unsupported calling convention.");
1117 }
1118 }
1119
1120 /// The SelectionDAGBuilder will automatically promote function arguments
1121 /// with illegal types. However, this does not work for the AMDGPU targets
1122 /// since the function arguments are stored in memory as these illegal types.
1123 /// In order to handle this properly we need to get the original type sizes
1124 /// from the LLVM IR Function and fix up the ISD::InputArg values before
1125 /// passing them to AnalyzeFormalArguments().
1126
1127 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1128 /// input values across multiple registers. Each item in the Ins array
1129 /// represents a single value that will be stored in registers. Ins[x].VT is
1130 /// the value type of the value that will be stored in the register, so
1131 /// whatever SDNode we lower the argument to needs to be this type.
1132 ///
1133 /// In order to correctly lower the arguments we need to know the size of each
1134 /// argument. Since Ins[x].VT gives us the size of the register that will
1135 /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1136 /// for the original function argument so that we can deduce the correct memory
1137 /// type to use for Ins[x]. In most cases the correct memory type will be
1138 /// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1139 /// we have a kernel argument of type v8i8, this argument will be split into
1140 /// 8 parts and each part will be represented by its own item in the Ins array.
1141 /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1142 /// the argument before it was split. From this, we deduce that the memory type
1143 /// for each individual part is i8. We pass the memory type as LocVT to the
1144 /// calling convention analysis function and the register type (Ins[x].VT) as
1145 /// the ValVT.
1146 void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
1147 CCState &State,
1148 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1149 const MachineFunction &MF = State.getMachineFunction();
1150 const Function &Fn = MF.getFunction();
1151 LLVMContext &Ctx = Fn.getParent()->getContext();
1152 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1153 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1154 CallingConv::ID CC = Fn.getCallingConv();
1155
1156 Align MaxAlign = Align(1);
1157 uint64_t ExplicitArgOffset = 0;
1158 const DataLayout &DL = Fn.getDataLayout();
1159
1160 unsigned InIndex = 0;
1161
1162 for (const Argument &Arg : Fn.args()) {
1163 const bool IsByRef = Arg.hasByRefAttr();
1164 Type *BaseArgTy = Arg.getType();
1165 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1166 Align Alignment = DL.getValueOrABITypeAlignment(
1167 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1168 MaxAlign = std::max(Alignment, MaxAlign);
1169 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1170
1171 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1172 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
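// Worked example (illustrative values): with ExplicitOffset = 36 and the
// previous argument ending at ExplicitArgOffset = 4, an 8-byte aligned
// argument gets ArgOffset = alignTo(4, 8) + 36 = 44, and ExplicitArgOffset
// advances to 8 + AllocSize.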
1173
1174 // We're basically throwing away everything passed into us and starting over
1175 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1176 // to us as computed in Ins.
1177 //
1178 // We also need to figure out what type legalization is trying to do to get
1179 // the correct memory offsets.
1180
1181 SmallVector<EVT, 16> ValueVTs;
1182 SmallVector<uint64_t, 16> Offsets;
1183 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1184
1185 for (unsigned Value = 0, NumValues = ValueVTs.size();
1186 Value != NumValues; ++Value) {
1187 uint64_t BasePartOffset = Offsets[Value];
1188
1189 EVT ArgVT = ValueVTs[Value];
1190 EVT MemVT = ArgVT;
1191 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1192 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1193
1194 if (NumRegs == 1) {
1195 // This argument is not split, so the IR type is the memory type.
1196 if (ArgVT.isExtended()) {
1197 // We have an extended type, like i24, so we should just use the
1198 // register type.
1199 MemVT = RegisterVT;
1200 } else {
1201 MemVT = ArgVT;
1202 }
1203 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1204 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1205 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1206 // We have a vector value which has been split into a vector with
1207 // the same scalar type, but fewer elements. This should handle
1208 // all the floating-point vector types.
1209 MemVT = RegisterVT;
1210 } else if (ArgVT.isVector() &&
1211 ArgVT.getVectorNumElements() == NumRegs) {
1212 // This arg has been split so that each element is stored in a separate
1213 // register.
1214 MemVT = ArgVT.getScalarType();
1215 } else if (ArgVT.isExtended()) {
1216 // We have an extended type, like i65.
1217 MemVT = RegisterVT;
1218 } else {
1219 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1220 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
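// Illustrative: a scalar i64 argument split across two i32 registers gives
// MemoryBits = 64 / 2 = 32, so the per-part memory type chosen below is i32.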
1221 if (RegisterVT.isInteger()) {
1222 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1223 } else if (RegisterVT.isVector()) {
1224 assert(!RegisterVT.getScalarType().isFloatingPoint());
1225 unsigned NumElements = RegisterVT.getVectorNumElements();
1226 assert(MemoryBits % NumElements == 0);
1227 // This vector type has been split into another vector type with
1228 // a different element size.
1229 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1230 MemoryBits / NumElements);
1231 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1232 } else {
1233 llvm_unreachable("cannot deduce memory type.");
1234 }
1235 }
1236
1237 // Convert one element vectors to scalar.
1238 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1239 MemVT = MemVT.getScalarType();
1240
1241 // Round up vec3/vec5 argument.
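// Illustrative: v3i32 -> v4i32, v5f32 -> v8f32, and v9f32..v12f32 -> v16f32.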
1242 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1243 assert(MemVT.getVectorNumElements() == 3 ||
1244 MemVT.getVectorNumElements() == 5 ||
1245 (MemVT.getVectorNumElements() >= 9 &&
1246 MemVT.getVectorNumElements() <= 12));
1247 MemVT = MemVT.getPow2VectorType(State.getContext());
1248 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1249 MemVT = MemVT.getRoundIntegerType(State.getContext());
1250 }
1251
1252 unsigned PartOffset = 0;
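// Illustrative: for the v8i8 kernel-argument case described in the function
// comment, NumRegs is 8 and MemVT is i8, so eight locations are added with
// byte offsets BasePartOffset + 0 .. BasePartOffset + 7.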
1253 for (unsigned i = 0; i != NumRegs; ++i) {
1254 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1255 BasePartOffset + PartOffset,
1256 MemVT.getSimpleVT(),
1257 CCValAssign::Full));
1258 PartOffset += MemVT.getStoreSize();
1259 }
1260 }
1261 }
1262 }
1263
1264 SDValue AMDGPUTargetLowering::LowerReturn(
1265 SDValue Chain, CallingConv::ID CallConv,
1266 bool isVarArg,
1267 const SmallVectorImpl<ISD::OutputArg> &Outs,
1268 const SmallVectorImpl<SDValue> &OutVals,
1269 const SDLoc &DL, SelectionDAG &DAG) const {
1270 // FIXME: Fails for r600 tests
1271 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1272 // "wave terminate should not have return values");
1273 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1274 }
1275
1276 //===---------------------------------------------------------------------===//
1277 // Target specific lowering
1278 //===---------------------------------------------------------------------===//
1279
1280 /// Selects the correct CCAssignFn for a given CallingConvention value.
1281 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1282 bool IsVarArg) {
1283 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1284 }
1285
1286 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1287 bool IsVarArg) {
1288 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1289 }
1290
1291 SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1292 SelectionDAG &DAG,
1293 MachineFrameInfo &MFI,
1294 int ClobberedFI) const {
1295 SmallVector<SDValue, 8> ArgChains;
1296 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1297 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1298
1299 // Include the original chain at the beginning of the list. When this is
1300 // used by target LowerCall hooks, this helps legalize find the
1301 // CALLSEQ_BEGIN node.
1302 ArgChains.push_back(Chain);
1303
1304 // Add a chain value for each stack argument load that overlaps the clobbered object.
1305 for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1306 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1307 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1308 if (FI->getIndex() < 0) {
1309 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1310 int64_t InLastByte = InFirstByte;
1311 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1312
1313 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1314 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1315 ArgChains.push_back(SDValue(L, 1));
1316 }
1317 }
1318 }
1319 }
1320
1321 // Build a tokenfactor for all the chains.
1322 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1323 }
1324
1325 SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1326 SmallVectorImpl<SDValue> &InVals,
1327 StringRef Reason) const {
1328 SDValue Callee = CLI.Callee;
1329 SelectionDAG &DAG = CLI.DAG;
1330
1331 const Function &Fn = DAG.getMachineFunction().getFunction();
1332
1333 StringRef FuncName("<unknown>");
1334
1335 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1336 FuncName = G->getSymbol();
1337 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1338 FuncName = G->getGlobal()->getName();
1339
1340 DiagnosticInfoUnsupported NoCalls(
1341 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1342 DAG.getContext()->diagnose(NoCalls);
1343
1344 if (!CLI.IsTailCall) {
1345 for (ISD::InputArg &Arg : CLI.Ins)
1346 InVals.push_back(DAG.getUNDEF(Arg.VT));
1347 }
1348
1349 return DAG.getEntryNode();
1350 }
1351
1352 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1353 SmallVectorImpl<SDValue> &InVals) const {
1354 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1355 }
1356
1357 SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1358 SelectionDAG &DAG) const {
1359 const Function &Fn = DAG.getMachineFunction().getFunction();
1360
1361 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1362 SDLoc(Op).getDebugLoc());
1363 DAG.getContext()->diagnose(NoDynamicAlloca);
1364 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1365 return DAG.getMergeValues(Ops, SDLoc());
1366 }
1367
1368 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1369 SelectionDAG &DAG) const {
1370 switch (Op.getOpcode()) {
1371 default:
1372 Op->print(errs(), &DAG);
1373 llvm_unreachable("Custom lowering code for this "
1374 "instruction is not implemented yet!");
1375 break;
1376 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1377 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1378 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1379 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1380 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1381 case ISD::FREM: return LowerFREM(Op, DAG);
1382 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1383 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1384 case ISD::FRINT: return LowerFRINT(Op, DAG);
1385 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1386 case ISD::FROUNDEVEN:
1387 return LowerFROUNDEVEN(Op, DAG);
1388 case ISD::FROUND: return LowerFROUND(Op, DAG);
1389 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1390 case ISD::FLOG2:
1391 return LowerFLOG2(Op, DAG);
1392 case ISD::FLOG:
1393 case ISD::FLOG10:
1394 return LowerFLOGCommon(Op, DAG);
1395 case ISD::FEXP:
1396 case ISD::FEXP10:
1397 return lowerFEXP(Op, DAG);
1398 case ISD::FEXP2:
1399 return lowerFEXP2(Op, DAG);
1400 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1401 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1402 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1403 case ISD::FP_TO_SINT:
1404 case ISD::FP_TO_UINT:
1405 return LowerFP_TO_INT(Op, DAG);
1406 case ISD::CTTZ:
1407 case ISD::CTTZ_ZERO_UNDEF:
1408 case ISD::CTLZ:
1409 case ISD::CTLZ_ZERO_UNDEF:
1410 return LowerCTLZ_CTTZ(Op, DAG);
1411 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1412 }
1413 return Op;
1414 }
1415
1416 void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1417 SmallVectorImpl<SDValue> &Results,
1418 SelectionDAG &DAG) const {
1419 switch (N->getOpcode()) {
1420 case ISD::SIGN_EXTEND_INREG:
1421 // Different parts of legalization seem to interpret which type of
1422 // sign_extend_inreg is the one to check for custom lowering. The extended
1423 // from type is what really matters, but some places check for custom
1424 // lowering of the result type. This results in trying to use
1425 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1426 // nothing here and let the illegal result integer be handled normally.
1427 return;
1428 case ISD::FLOG2:
1429 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1430 Results.push_back(Lowered);
1431 return;
1432 case ISD::FLOG:
1433 case ISD::FLOG10:
1434 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1435 Results.push_back(Lowered);
1436 return;
1437 case ISD::FEXP2:
1438 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1439 Results.push_back(Lowered);
1440 return;
1441 case ISD::FEXP:
1442 case ISD::FEXP10:
1443 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1444 Results.push_back(Lowered);
1445 return;
1446 case ISD::CTLZ:
1447 case ISD::CTLZ_ZERO_UNDEF:
1448 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1449 Results.push_back(Lowered);
1450 return;
1451 default:
1452 return;
1453 }
1454 }
1455
1456 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1457 SDValue Op,
1458 SelectionDAG &DAG) const {
1459
1460 const DataLayout &DL = DAG.getDataLayout();
1461 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1462 const GlobalValue *GV = G->getGlobal();
1463
1464 if (!MFI->isModuleEntryFunction()) {
1465 if (std::optional<uint32_t> Address =
1466 AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
1467 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1468 }
1469 }
1470
1471 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1472 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1473 if (!MFI->isModuleEntryFunction() &&
1474 GV->getName() != "llvm.amdgcn.module.lds") {
1475 SDLoc DL(Op);
1476 const Function &Fn = DAG.getMachineFunction().getFunction();
1477 DiagnosticInfoUnsupported BadLDSDecl(
1478 Fn, "local memory global used by non-kernel function",
1479 DL.getDebugLoc(), DS_Warning);
1480 DAG.getContext()->diagnose(BadLDSDecl);
1481
1482 // We currently don't have a way to correctly allocate LDS objects that
1483 // aren't directly associated with a kernel. We do force inlining of
1484 // functions that use local objects. However, if these dead functions are
1485 // not eliminated, we don't want a compile time error. Just emit a warning
1486 // and a trap, since there should be no callable path here.
1487 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1488 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1489 Trap, DAG.getRoot());
1490 DAG.setRoot(OutputChain);
1491 return DAG.getUNDEF(Op.getValueType());
1492 }
1493
1494 // XXX: What does the value of G->getOffset() mean?
1495 assert(G->getOffset() == 0 &&
1496          "Do not know what to do with a non-zero offset");
1497
1498 // TODO: We could emit code to handle the initialization somewhere.
1499 // We ignore the initializer for now and legalize it to allow selection.
1500   // Any initializer is diagnosed as an error later, during assembly emission.
1501 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1502 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1503 }
1504 return SDValue();
1505 }
1506
1507 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1508 SelectionDAG &DAG) const {
1509 SmallVector<SDValue, 8> Args;
1510 SDLoc SL(Op);
1511
1512 EVT VT = Op.getValueType();
1513 if (VT.getVectorElementType().getSizeInBits() < 32) {
1514 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1515 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1516 unsigned NewNumElt = OpBitSize / 32;
1517 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1518 : EVT::getVectorVT(*DAG.getContext(),
1519 MVT::i32, NewNumElt);
1520 for (const SDUse &U : Op->ops()) {
1521 SDValue In = U.get();
1522 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1523 if (NewNumElt > 1)
1524 DAG.ExtractVectorElements(NewIn, Args);
1525 else
1526 Args.push_back(NewIn);
1527 }
1528
1529 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1530 NewNumElt * Op.getNumOperands());
1531 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1532 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1533 }
1534 }
1535
1536 for (const SDUse &U : Op->ops())
1537 DAG.ExtractVectorElements(U.get(), Args);
1538
1539 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1540 }
1541
1542 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1543 SelectionDAG &DAG) const {
1544 SDLoc SL(Op);
1545 SmallVector<SDValue, 8> Args;
1546 unsigned Start = Op.getConstantOperandVal(1);
1547 EVT VT = Op.getValueType();
1548 EVT SrcVT = Op.getOperand(0).getValueType();
1549
1550 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1551 unsigned NumElt = VT.getVectorNumElements();
1552 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1553 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1554
1555 // Extract 32-bit registers at a time.
1556 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1557 EVT NewVT = NumElt == 2
1558 ? MVT::i32
1559 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1560 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1561
1562 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1563 if (NumElt == 2)
1564 Tmp = Args[0];
1565 else
1566 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1567
1568 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1569 }
1570
1571 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1572 VT.getVectorNumElements());
1573
1574 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1575 }
1576
1577 // TODO: Handle fabs too
1578 static SDValue peekFNeg(SDValue Val) {
1579 if (Val.getOpcode() == ISD::FNEG)
1580 return Val.getOperand(0);
1581
1582 return Val;
1583 }
1584
1585 static SDValue peekFPSignOps(SDValue Val) {
1586 if (Val.getOpcode() == ISD::FNEG)
1587 Val = Val.getOperand(0);
1588 if (Val.getOpcode() == ISD::FABS)
1589 Val = Val.getOperand(0);
1590 if (Val.getOpcode() == ISD::FCOPYSIGN)
1591 Val = Val.getOperand(0);
1592 return Val;
1593 }
1594
1595 SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
1596 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1597 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1598 SelectionDAG &DAG = DCI.DAG;
1599 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1600 switch (CCOpcode) {
1601 case ISD::SETOEQ:
1602 case ISD::SETONE:
1603 case ISD::SETUNE:
1604 case ISD::SETNE:
1605 case ISD::SETUEQ:
1606 case ISD::SETEQ:
1607 case ISD::SETFALSE:
1608 case ISD::SETFALSE2:
1609 case ISD::SETTRUE:
1610 case ISD::SETTRUE2:
1611 case ISD::SETUO:
1612 case ISD::SETO:
1613 break;
1614 case ISD::SETULE:
1615 case ISD::SETULT: {
1616 if (LHS == True)
1617 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1618 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1619 }
1620 case ISD::SETOLE:
1621 case ISD::SETOLT:
1622 case ISD::SETLE:
1623 case ISD::SETLT: {
1624 // Ordered. Assume ordered for undefined.
1625
1626 // Only do this after legalization to avoid interfering with other combines
1627 // which might occur.
1628 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1629 !DCI.isCalledByLegalizer())
1630 return SDValue();
1631
1632 // We need to permute the operands to get the correct NaN behavior. The
1633 // selected operand is the second one based on the failing compare with NaN,
1634 // so permute it based on the compare type the hardware uses.
1635 if (LHS == True)
1636 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1637 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1638 }
1639 case ISD::SETUGE:
1640 case ISD::SETUGT: {
1641 if (LHS == True)
1642 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1643 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1644 }
1645 case ISD::SETGT:
1646 case ISD::SETGE:
1647 case ISD::SETOGE:
1648 case ISD::SETOGT: {
1649 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1650 !DCI.isCalledByLegalizer())
1651 return SDValue();
1652
1653 if (LHS == True)
1654 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1655 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1656 }
1657 case ISD::SETCC_INVALID:
1658 llvm_unreachable("Invalid setcc condcode!");
1659 }
1660 return SDValue();
1661 }
1662
1663 /// Generate Min/Max node
1664 SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1665 SDValue LHS, SDValue RHS,
1666 SDValue True, SDValue False,
1667 SDValue CC,
1668 DAGCombinerInfo &DCI) const {
1669 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1670 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1671
1672 SelectionDAG &DAG = DCI.DAG;
1673
1674 // If we can't directly match this, try to see if we can fold an fneg to
1675 // match.
1676
1677 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1678 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1679 SDValue NegTrue = peekFNeg(True);
1680
1681 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1682 // fmin/fmax.
1683 //
1684 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1685 // -> fneg (fmin_legacy lhs, K)
1686 //
1687 // TODO: Use getNegatedExpression
1688 if (LHS == NegTrue && CFalse && CRHS) {
1689 APFloat NegRHS = neg(CRHS->getValueAPF());
1690 if (NegRHS == CFalse->getValueAPF()) {
1691 SDValue Combined =
1692 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1693 if (Combined)
1694 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1695 return SDValue();
1696 }
1697 }
1698
1699 return SDValue();
1700 }
1701
1702 std::pair<SDValue, SDValue>
1703 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1704 SDLoc SL(Op);
1705
1706 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1707
1708 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1709 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1710
1711 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1712 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1713
1714 return std::pair(Lo, Hi);
1715 }
1716
1717 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1718 SDLoc SL(Op);
1719
1720 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1721 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1722 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1723 }
1724
1725 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1726 SDLoc SL(Op);
1727
1728 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1729 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1730 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1731 }
1732
1733 // Split a vector type into two parts. The first part is a power of two vector.
1734 // The second part is whatever is left over, and is a scalar if it would
1735 // otherwise be a 1-vector.
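// For example: v3i32 -> (v2i32, i32), v5i32 -> (v4i32, i32),
// v7i16 -> (v4i16, v3i16).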
1736 std::pair<EVT, EVT>
1737 AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1738 EVT LoVT, HiVT;
1739 EVT EltVT = VT.getVectorElementType();
1740 unsigned NumElts = VT.getVectorNumElements();
1741 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1742 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1743 HiVT = NumElts - LoNumElts == 1
1744 ? EltVT
1745 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1746 return std::pair(LoVT, HiVT);
1747 }
1748
1749 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1750 // scalar.
1751 std::pair<SDValue, SDValue>
1752 AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1753 const EVT &LoVT, const EVT &HiVT,
1754 SelectionDAG &DAG) const {
1755 assert(LoVT.getVectorNumElements() +
1756 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1757 N.getValueType().getVectorNumElements() &&
1758 "More vector elements requested than available!");
1759 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1760 DAG.getVectorIdxConstant(0, DL));
1761 SDValue Hi = DAG.getNode(
1762 HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1763 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1764 return std::pair(Lo, Hi);
1765 }
1766
1767 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1768 SelectionDAG &DAG) const {
1769 LoadSDNode *Load = cast<LoadSDNode>(Op);
1770 EVT VT = Op.getValueType();
1771 SDLoc SL(Op);
1772
1773
1774 // If this is a 2 element vector, we really want to scalarize and not create
1775 // weird 1 element vectors.
1776 if (VT.getVectorNumElements() == 2) {
1777 SDValue Ops[2];
1778 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1779 return DAG.getMergeValues(Ops, SL);
1780 }
1781
1782 SDValue BasePtr = Load->getBasePtr();
1783 EVT MemVT = Load->getMemoryVT();
1784
1785 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1786
1787 EVT LoVT, HiVT;
1788 EVT LoMemVT, HiMemVT;
1789 SDValue Lo, Hi;
1790
1791 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1792 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1793 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1794
1795 unsigned Size = LoMemVT.getStoreSize();
1796 Align BaseAlign = Load->getAlign();
1797 Align HiAlign = commonAlignment(BaseAlign, Size);
1798
1799 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1800 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1801 BaseAlign, Load->getMemOperand()->getFlags());
1802 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1803 SDValue HiLoad =
1804 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1805 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1806 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1807
1808 SDValue Join;
1809 if (LoVT == HiVT) {
1810     // This is the case where the vector length is a power of two, so it was split evenly.
1811 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1812 } else {
1813 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1814 DAG.getVectorIdxConstant(0, SL));
1815 Join = DAG.getNode(
1816 HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1817 VT, Join, HiLoad,
1818 DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1819 }
1820
1821 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1822 LoLoad.getValue(1), HiLoad.getValue(1))};
1823
1824 return DAG.getMergeValues(Ops, SL);
1825 }
1826
1827 SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1828 SelectionDAG &DAG) const {
1829 LoadSDNode *Load = cast<LoadSDNode>(Op);
1830 EVT VT = Op.getValueType();
1831 SDValue BasePtr = Load->getBasePtr();
1832 EVT MemVT = Load->getMemoryVT();
1833 SDLoc SL(Op);
1834 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1835 Align BaseAlign = Load->getAlign();
1836 unsigned NumElements = MemVT.getVectorNumElements();
1837
1838 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1839 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1840 if (NumElements != 3 ||
1841 (BaseAlign < Align(8) &&
1842 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1843 return SplitVectorLoad(Op, DAG);
1844
1845 assert(NumElements == 3);
1846
1847 EVT WideVT =
1848 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1849 EVT WideMemVT =
1850 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1851 SDValue WideLoad = DAG.getExtLoad(
1852 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1853 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1854 return DAG.getMergeValues(
1855 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1856 DAG.getVectorIdxConstant(0, SL)),
1857 WideLoad.getValue(1)},
1858 SL);
1859 }
1860
1861 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1862 SelectionDAG &DAG) const {
1863 StoreSDNode *Store = cast<StoreSDNode>(Op);
1864 SDValue Val = Store->getValue();
1865 EVT VT = Val.getValueType();
1866
1867 // If this is a 2 element vector, we really want to scalarize and not create
1868 // weird 1 element vectors.
1869 if (VT.getVectorNumElements() == 2)
1870 return scalarizeVectorStore(Store, DAG);
1871
1872 EVT MemVT = Store->getMemoryVT();
1873 SDValue Chain = Store->getChain();
1874 SDValue BasePtr = Store->getBasePtr();
1875 SDLoc SL(Op);
1876
1877 EVT LoVT, HiVT;
1878 EVT LoMemVT, HiMemVT;
1879 SDValue Lo, Hi;
1880
1881 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1882 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1883 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1884
1885 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1886
1887 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1888 Align BaseAlign = Store->getAlign();
1889 unsigned Size = LoMemVT.getStoreSize();
1890 Align HiAlign = commonAlignment(BaseAlign, Size);
1891
1892 SDValue LoStore =
1893 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1894 Store->getMemOperand()->getFlags());
1895 SDValue HiStore =
1896 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1897 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1898
1899 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1900 }
1901
1902 // This is a shortcut for integer division because we have fast i32<->f32
1903 // conversions, and fast f32 reciprocal instructions. The 24-bit significand of
1904 // an f32 is enough to accurately represent integers of up to 24 bits.
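//
// Roughly: q = (int)((float)lhs * rcp((float)rhs)) is computed in f32 and then
// corrected by at most one unit, which is exact as long as both operands (and
// therefore the quotient) fit in that 24-bit significand. The sign-bit checks
// below guarantee that before taking this path.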
1905 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1906 bool Sign) const {
1907 SDLoc DL(Op);
1908 EVT VT = Op.getValueType();
1909 SDValue LHS = Op.getOperand(0);
1910 SDValue RHS = Op.getOperand(1);
1911 MVT IntVT = MVT::i32;
1912 MVT FltVT = MVT::f32;
1913
1914 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1915 if (LHSSignBits < 9)
1916 return SDValue();
1917
1918 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1919 if (RHSSignBits < 9)
1920 return SDValue();
1921
1922 unsigned BitSize = VT.getSizeInBits();
1923 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1924 unsigned DivBits = BitSize - SignBits;
1925 if (Sign)
1926 ++DivBits;
1927
1928 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1929 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1930
1931 SDValue jq = DAG.getConstant(1, DL, IntVT);
1932
1933 if (Sign) {
1934 // char|short jq = ia ^ ib;
1935 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1936
1937 // jq = jq >> (bitsize - 2)
1938 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1939 DAG.getConstant(BitSize - 2, DL, VT));
1940
1941 // jq = jq | 0x1
1942 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1943 }
1944
1945 // int ia = (int)LHS;
1946 SDValue ia = LHS;
1947
1948   // int ib = (int)RHS;
1949 SDValue ib = RHS;
1950
1951 // float fa = (float)ia;
1952 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1953
1954 // float fb = (float)ib;
1955 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1956
1957 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1958 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1959
1960 // fq = trunc(fq);
1961 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1962
1963 // float fqneg = -fq;
1964 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1965
1966 MachineFunction &MF = DAG.getMachineFunction();
1967
1968 bool UseFmadFtz = false;
1969 if (Subtarget->isGCN()) {
1970 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1971 UseFmadFtz =
1972 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
1973 }
1974
1975 // float fr = mad(fqneg, fb, fa);
1976 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1977 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1978 : (unsigned)ISD::FMAD;
1979 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1980
1981 // int iq = (int)fq;
1982 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1983
1984 // fr = fabs(fr);
1985 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1986
1987 // fb = fabs(fb);
1988 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1989
1990 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1991
1992 // int cv = fr >= fb;
1993 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1994
1995 // jq = (cv ? jq : 0);
1996 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1997
1998 // dst = iq + jq;
1999 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2000
2001   // Rem needs compensation; it's easier to recompute it.
2002 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2003 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2004
2005 // Truncate to number of bits this divide really is.
2006 if (Sign) {
2007 SDValue InRegSize
2008 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2009 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2010 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2011 } else {
2012 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2013 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2014 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2015 }
2016
2017 return DAG.getMergeValues({ Div, Rem }, DL);
2018 }
2019
2020 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2021 SelectionDAG &DAG,
2022 SmallVectorImpl<SDValue> &Results) const {
2023 SDLoc DL(Op);
2024 EVT VT = Op.getValueType();
2025
2026 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2027
2028 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2029
2030 SDValue One = DAG.getConstant(1, DL, HalfVT);
2031 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2032
2033   // HiLo split
2034 SDValue LHS_Lo, LHS_Hi;
2035 SDValue LHS = Op.getOperand(0);
2036 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2037
2038 SDValue RHS_Lo, RHS_Hi;
2039 SDValue RHS = Op.getOperand(1);
2040 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2041
2042 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2043 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2044
2045 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2046 LHS_Lo, RHS_Lo);
2047
2048 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2049 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2050
2051 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2052 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2053 return;
2054 }
2055
2056 if (isTypeLegal(MVT::i64)) {
2057 // The algorithm here is based on ideas from "Software Integer Division",
2058 // Tom Rodeheffer, August 2008.
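    //
    // Outline: start from a 64-bit fixed-point estimate Rcp64 ~= 2^64 / RHS
    // built out of f32 reciprocals, then sharpen it with two Newton-Raphson
    // style steps of the form rcp += mulhu(rcp, -RHS * rcp). The quotient
    // estimate is then mulhu(LHS, rcp) and the remainder is LHS - RHS * q; the
    // selects below apply up to two final +1 / -RHS corrections.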
2059
2060 MachineFunction &MF = DAG.getMachineFunction();
2061 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2062
2063 // Compute denominator reciprocal.
2064 unsigned FMAD =
2065 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2066 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2067 ? (unsigned)ISD::FMAD
2068 : (unsigned)AMDGPUISD::FMAD_FTZ;
2069
2070 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2071 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2072 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2073 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2074 Cvt_Lo);
2075 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2076 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2077 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2078 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2079 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2080 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2081 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2082 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2083 Mul1);
2084 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2085 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2086 SDValue Rcp64 = DAG.getBitcast(VT,
2087 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2088
2089 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2090 SDValue One64 = DAG.getConstant(1, DL, VT);
2091 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2092 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2093
2094 // First round of UNR (Unsigned integer Newton-Raphson).
2095 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2096 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2097 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2098 SDValue Mulhi1_Lo, Mulhi1_Hi;
2099 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2100 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2101 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2102 Mulhi1_Lo, Zero1);
2103 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2104 Mulhi1_Hi, Add1_Lo.getValue(1));
2105 SDValue Add1 = DAG.getBitcast(VT,
2106 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2107
2108 // Second round of UNR.
2109 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2110 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2111 SDValue Mulhi2_Lo, Mulhi2_Hi;
2112 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2113 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2114 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2115 Mulhi2_Lo, Zero1);
2116 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2117 Mulhi2_Hi, Add2_Lo.getValue(1));
2118 SDValue Add2 = DAG.getBitcast(VT,
2119 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2120
2121 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2122
2123 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2124
2125 SDValue Mul3_Lo, Mul3_Hi;
2126 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2127 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2128 Mul3_Lo, Zero1);
2129 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2130 Mul3_Hi, Sub1_Lo.getValue(1));
2131 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2132 SDValue Sub1 = DAG.getBitcast(VT,
2133 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2134
2135 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2136 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2137 ISD::SETUGE);
2138 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2139 ISD::SETUGE);
2140 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2141
2142     // TODO: Here and below, portions of the code could be enclosed in if/endif.
2143     // Currently the control flow is unconditional, and we use 4 selects after the
2144     // potential endif in place of PHIs.
2145
2146 // if C3 != 0 ...
2147 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2148 RHS_Lo, Zero1);
2149 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2150 RHS_Hi, Sub1_Lo.getValue(1));
2151 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2152 Zero, Sub2_Lo.getValue(1));
2153 SDValue Sub2 = DAG.getBitcast(VT,
2154 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2155
2156 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2157
2158 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2159 ISD::SETUGE);
2160 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2161 ISD::SETUGE);
2162 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2163
2164 // if (C6 != 0)
2165 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2166
2167 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2168 RHS_Lo, Zero1);
2169 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2170 RHS_Hi, Sub2_Lo.getValue(1));
2171 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2172 Zero, Sub3_Lo.getValue(1));
2173 SDValue Sub3 = DAG.getBitcast(VT,
2174 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2175
2176 // endif C6
2177 // endif C3
2178
2179 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2180 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2181
2182 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2183 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2184
2185 Results.push_back(Div);
2186 Results.push_back(Rem);
2187
2188 return;
2189 }
2190
2191   // R600 expansion.
2192   // Get speculative values.
2193 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2194 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2195
2196 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2197 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2198 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2199
2200 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2201 SDValue DIV_Lo = Zero;
2202
2203 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2204
2205 for (unsigned i = 0; i < halfBitWidth; ++i) {
2206 const unsigned bitPos = halfBitWidth - i - 1;
2207 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2208 // Get value of high bit
2209 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2210 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2211 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2212
2213 // Shift
2214 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2215 // Add LHS high bit
2216 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2217
2218 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2219 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2220
2221 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2222
2223 // Update REM
2224 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2225 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2226 }
2227
2228 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2229 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2230 Results.push_back(DIV);
2231 Results.push_back(REM);
2232 }
2233
2234 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2235 SelectionDAG &DAG) const {
2236 SDLoc DL(Op);
2237 EVT VT = Op.getValueType();
2238
2239 if (VT == MVT::i64) {
2240 SmallVector<SDValue, 2> Results;
2241 LowerUDIVREM64(Op, DAG, Results);
2242 return DAG.getMergeValues(Results, DL);
2243 }
2244
2245 if (VT == MVT::i32) {
2246 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2247 return Res;
2248 }
2249
2250 SDValue X = Op.getOperand(0);
2251 SDValue Y = Op.getOperand(1);
2252
2253 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2254 // algorithm used here.
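  //
  // In short: z starts as a hardware reciprocal estimate of 2^32 / y, one
  // Newton-Raphson step refines it, and q = mulhu(x, z) is then only slightly
  // low; the two conditional refinement blocks below make it exact.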
2255
2256 // Initial estimate of inv(y).
2257 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2258
2259 // One round of UNR.
2260 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2261 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2262 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2263 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2264
2265 // Quotient/remainder estimate.
2266 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2267 SDValue R =
2268 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2269
2270 // First quotient/remainder refinement.
2271 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2272 SDValue One = DAG.getConstant(1, DL, VT);
2273 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2274 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2275 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2276 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2277 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2278
2279 // Second quotient/remainder refinement.
2280 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2281 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2282 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2283 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2284 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2285
2286 return DAG.getMergeValues({Q, R}, DL);
2287 }
2288
2289 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2290 SelectionDAG &DAG) const {
2291 SDLoc DL(Op);
2292 EVT VT = Op.getValueType();
2293
2294 SDValue LHS = Op.getOperand(0);
2295 SDValue RHS = Op.getOperand(1);
2296
2297 SDValue Zero = DAG.getConstant(0, DL, VT);
2298 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2299
2300 if (VT == MVT::i32) {
2301 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2302 return Res;
2303 }
2304
2305 if (VT == MVT::i64 &&
2306 DAG.ComputeNumSignBits(LHS) > 32 &&
2307 DAG.ComputeNumSignBits(RHS) > 32) {
2308 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2309
2310     // HiLo split
2311 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2312 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2313 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2314 LHS_Lo, RHS_Lo);
2315 SDValue Res[2] = {
2316 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2317 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2318 };
2319 return DAG.getMergeValues(Res, DL);
2320 }
2321
2322 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2323 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2324 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2325 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2326
2327 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2328 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2329
2330 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2331 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2332
2333 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2334 SDValue Rem = Div.getValue(1);
2335
2336 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2337 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2338
2339 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2340 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2341
2342 SDValue Res[2] = {
2343 Div,
2344 Rem
2345 };
2346 return DAG.getMergeValues(Res, DL);
2347 }
2348
2349 // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
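// For example, frem(5.5, 2.0): trunc(5.5 / 2.0) = 2.0, and
// fma(-2.0, 2.0, 5.5) = 1.5.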
2350 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2351 SDLoc SL(Op);
2352 EVT VT = Op.getValueType();
2353 auto Flags = Op->getFlags();
2354 SDValue X = Op.getOperand(0);
2355 SDValue Y = Op.getOperand(1);
2356
2357 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2358 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2359 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2360 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2361 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2362 }
2363
2364 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2365 SDLoc SL(Op);
2366 SDValue Src = Op.getOperand(0);
2367
2368 // result = trunc(src)
2369 // if (src > 0.0 && src != result)
2370 // result += 1.0
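  //
  // For example, ceil(2.3): trunc = 2.0, and since 2.3 > 0.0 and 2.3 != 2.0 the
  // result is 3.0. For ceil(-2.3): trunc = -2.0 and the source is negative, so
  // the result stays -2.0.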
2371
2372 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2373
2374 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2375 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2376
2377 EVT SetCCVT =
2378 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2379
2380 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2381 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2382 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2383
2384 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2385 // TODO: Should this propagate fast-math-flags?
2386 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2387 }
2388
2389 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2390 SelectionDAG &DAG) {
2391 const unsigned FractBits = 52;
2392 const unsigned ExpBits = 11;
2393
2394 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2395 Hi,
2396 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2397 DAG.getConstant(ExpBits, SL, MVT::i32));
2398 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2399 DAG.getConstant(1023, SL, MVT::i32));
2400
2401 return Exp;
2402 }
2403
2404 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2405 SDLoc SL(Op);
2406 SDValue Src = Op.getOperand(0);
2407
2408 assert(Op.getValueType() == MVT::f64);
2409
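  // The fraction is cleared with a mask derived from the exponent: for
  // 0 <= exp <= 51 the low (52 - exp) fraction bits sit below the binary point
  // and are zeroed. exp < 0 means |src| < 1.0, so only the sign bit survives
  // (+/-0.0), and exp > 51 means the value is already integral (or inf/nan) and
  // is passed through unchanged.
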
2410 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2411
2412 // Extract the upper half, since this is where we will find the sign and
2413 // exponent.
2414 SDValue Hi = getHiHalf64(Src, DAG);
2415
2416 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2417
2418 const unsigned FractBits = 52;
2419
2420 // Extract the sign bit.
2421 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2422 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2423
2424 // Extend back to 64-bits.
2425 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2426 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2427
2428 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2429 const SDValue FractMask
2430 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2431
2432 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2433 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2434 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2435
2436 EVT SetCCVT =
2437 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2438
2439 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2440
2441 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2442 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2443
2444 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2445 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2446
2447 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2448 }
2449
2450 SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2451 SelectionDAG &DAG) const {
2452 SDLoc SL(Op);
2453 SDValue Src = Op.getOperand(0);
2454
2455 assert(Op.getValueType() == MVT::f64);
2456
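  // Adding copysign(0x1.0p+52, src) pushes every fractional bit out of the
  // significand, so the FP add itself performs round-to-nearest-even; the
  // subtract then recovers the rounded integer. Values whose magnitude exceeds
  // 0x1.fffffffffffffp+51 are already integral (or inf/nan), so the final
  // select passes them through.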
2457 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2458 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2459 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2460
2461 // TODO: Should this propagate fast-math-flags?
2462
2463 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2464 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2465
2466 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2467
2468 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2469 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2470
2471 EVT SetCCVT =
2472 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2473 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2474
2475 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2476 }
2477
2478 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2479 SelectionDAG &DAG) const {
2480 // FNEARBYINT and FRINT are the same, except in their handling of FP
2481 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2482 // rint, so just treat them as equivalent.
2483 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2484 Op.getOperand(0));
2485 }
2486
2487 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2488 auto VT = Op.getValueType();
2489 auto Arg = Op.getOperand(0u);
2490 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2491 }
2492
2493 // XXX - May require not supporting f32 denormals?
2494
2495 // Don't handle v2f16. The extra instructions to scalarize and repack around the
2496 // compare and vselect end up producing worse code than scalarizing the whole
2497 // operation.
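//
// The lowering computes diff = x - trunc(x) and adds copysign(1.0, x) when
// |diff| >= 0.5, i.e. round-half-away-from-zero. For example, round(2.5):
// trunc = 2.0, |diff| = 0.5, result 3.0; round(-2.4): trunc = -2.0,
// |diff| = 0.4, result -2.0.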
2498 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2499 SDLoc SL(Op);
2500 SDValue X = Op.getOperand(0);
2501 EVT VT = Op.getValueType();
2502
2503 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2504
2505 // TODO: Should this propagate fast-math-flags?
2506
2507 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2508
2509 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2510
2511 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2512 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2513
2514 EVT SetCCVT =
2515 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2516
2517 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2518 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2519 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2520
2521 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2522 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2523 }
2524
2525 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2526 SDLoc SL(Op);
2527 SDValue Src = Op.getOperand(0);
2528
2529 // result = trunc(src);
2530 // if (src < 0.0 && src != result)
2531 // result += -1.0.
2532
2533 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2534
2535 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2536 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2537
2538 EVT SetCCVT =
2539 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2540
2541 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2542 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2543 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2544
2545 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2546 // TODO: Should this propagate fast-math-flags?
2547 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2548 }
2549
2550 /// Return true if it's known that \p Src can never be an f32 denormal value.
2551 static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2552 switch (Src.getOpcode()) {
2553 case ISD::FP_EXTEND:
2554 return Src.getOperand(0).getValueType() == MVT::f16;
2555 case ISD::FP16_TO_FP:
2556 case ISD::FFREXP:
2557 return true;
2558 case ISD::INTRINSIC_WO_CHAIN: {
2559 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2560 switch (IntrinsicID) {
2561 case Intrinsic::amdgcn_frexp_mant:
2562 return true;
2563 default:
2564 return false;
2565 }
2566 }
2567 default:
2568 return false;
2569 }
2570
2571 llvm_unreachable("covered opcode switch");
2572 }
2573
2574 bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
2575 SDNodeFlags Flags) {
2576 if (Flags.hasApproximateFuncs())
2577 return true;
2578 auto &Options = DAG.getTarget().Options;
2579 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2580 }
2581
2582 bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
2583 SDValue Src,
2584 SDNodeFlags Flags) {
2585 return !valueIsKnownNeverF32Denorm(Src) &&
2586 DAG.getMachineFunction()
2587 .getDenormalMode(APFloat::IEEEsingle())
2588 .Input != DenormalMode::PreserveSign;
2589 }
2590
2591 SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2592 SDValue Src,
2593 SDNodeFlags Flags) const {
2594 SDLoc SL(Src);
2595 EVT VT = Src.getValueType();
2596 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2597 SDValue SmallestNormal =
2598 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2599
2600 // Want to scale denormals up, but negatives and 0 work just as well on the
2601 // scaled path.
2602 SDValue IsLtSmallestNormal = DAG.getSetCC(
2603 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2604 SmallestNormal, ISD::SETOLT);
2605
2606 return IsLtSmallestNormal;
2607 }
2608
2609 SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2610 SDNodeFlags Flags) const {
2611 SDLoc SL(Src);
2612 EVT VT = Src.getValueType();
2613 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2614 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2615
2616 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2617 SDValue IsFinite = DAG.getSetCC(
2618 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2619 Inf, ISD::SETOLT);
2620 return IsFinite;
2621 }
2622
2623 /// If denormal handling is required return the scaled input to FLOG2, and the
2624 /// check for denormal range. Otherwise, return null values.
2625 std::pair<SDValue, SDValue>
2626 AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
2627 SDValue Src, SDNodeFlags Flags) const {
2628 if (!needsDenormHandlingF32(DAG, Src, Flags))
2629 return {};
2630
2631 MVT VT = MVT::f32;
2632 const fltSemantics &Semantics = APFloat::IEEEsingle();
2633 SDValue SmallestNormal =
2634 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2635
2636 SDValue IsLtSmallestNormal = DAG.getSetCC(
2637 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2638 SmallestNormal, ISD::SETOLT);
2639
2640 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2641 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2642 SDValue ScaleFactor =
2643 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2644
2645 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2646 return {ScaledInput, IsLtSmallestNormal};
2647 }
2648
2649 SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2650 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2651 // If we have to handle denormals, scale up the input and adjust the result.
2652
2653 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2654 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
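// (log2(x * 2^32) = log2(x) + 32, so the 32.0 is subtracted back out whenever
// the input was scaled.)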
2655
2656 SDLoc SL(Op);
2657 EVT VT = Op.getValueType();
2658 SDValue Src = Op.getOperand(0);
2659 SDNodeFlags Flags = Op->getFlags();
2660
2661 if (VT == MVT::f16) {
2662 // Nothing in half is a denormal when promoted to f32.
2663 assert(!Subtarget->has16BitInsts());
2664 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2665 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2666 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2667 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2668 }
2669
2670 auto [ScaledInput, IsLtSmallestNormal] =
2671 getScaledLogInput(DAG, SL, Src, Flags);
2672 if (!ScaledInput)
2673 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2674
2675 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2676
2677 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2678 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2679 SDValue ResultOffset =
2680 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2681 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2682 }
2683
2684 static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2685 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2686 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2687 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2688 }
2689
2690 SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2691 SelectionDAG &DAG) const {
2692 SDValue X = Op.getOperand(0);
2693 EVT VT = Op.getValueType();
2694 SDNodeFlags Flags = Op->getFlags();
2695 SDLoc DL(Op);
2696
2697 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2698 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2699
2700 const auto &Options = getTargetMachine().Options;
2701 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2702 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2703
2704 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2705 // Log and multiply in f32 is good enough for f16.
2706 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2707 }
2708
2709 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2710 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2711 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2712 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2713 }
2714
2715 return Lowered;
2716 }
2717
2718 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2719 if (ScaledInput)
2720 X = ScaledInput;
2721
2722 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2723
2724 SDValue R;
2725 if (Subtarget->hasFastFMAF32()) {
2726     // c + cc is ln(2)/ln(10) to more than 49 bits
2727 const float c_log10 = 0x1.344134p-2f;
2728 const float cc_log10 = 0x1.09f79ep-26f;
2729
2730 // c + cc is ln(2) to more than 49 bits
2731 const float c_log = 0x1.62e42ep-1f;
2732 const float cc_log = 0x1.efa39ep-25f;
2733
2734 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2735 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2736
2737 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2738 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2739 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2740 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2741 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2742 } else {
2743 // ch+ct is ln(2)/ln(10) to more than 36 bits
2744 const float ch_log10 = 0x1.344000p-2f;
2745 const float ct_log10 = 0x1.3509f6p-18f;
2746
2747 // ch + ct is ln(2) to more than 36 bits
2748 const float ch_log = 0x1.62e000p-1f;
2749 const float ct_log = 0x1.0bfbe8p-15f;
2750
2751 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2752 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2753
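    // Without fast FMA, y * c is evaluated in extended precision: y is split
    // into a high part YH (low 12 mantissa bits cleared) and a tail YT, and CH
    // itself carries only ~12 significant bits, so YH * CH is exact in f32. The
    // remaining partial products are folded in with the mads below.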
2754 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2755 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2756 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2757 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2758 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2759
2760 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2761 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2762 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2763 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2764 }
2765
2766 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2767 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2768
2769 // TODO: Check if known finite from source value.
2770 if (!IsFiniteOnly) {
2771 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2772 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2773 }
2774
2775 if (IsScaled) {
2776 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2777 SDValue ShiftK =
2778 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2779 SDValue Shift =
2780 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2781 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2782 }
2783
2784 return R;
2785 }
2786
2787 SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2788 return LowerFLOGCommon(Op, DAG);
2789 }
2790
2791 // Do f32 fast math expansion for flog or flog10. This is accurate enough for a
2792 // promoted f16 operation.
2793 SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2794 SelectionDAG &DAG, bool IsLog10,
2795 SDNodeFlags Flags) const {
2796 EVT VT = Src.getValueType();
2797 unsigned LogOp =
2798 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2799
2800 double Log2BaseInverted =
2801 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2802
2803 if (VT == MVT::f32) {
2804 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2805 if (ScaledInput) {
2806 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2807 SDValue ScaledResultOffset =
2808 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2809
2810 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2811
2812 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2813 ScaledResultOffset, Zero, Flags);
2814
2815 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2816
2817 if (Subtarget->hasFastFMAF32())
2818 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2819 Flags);
2820 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2821 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2822 }
2823 }
2824
2825 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2826 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2827
2828 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2829 Flags);
2830 }
2831
2832 SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2833 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2834 // If we have to handle denormals, scale up the input and adjust the result.
2835
2836 SDLoc SL(Op);
2837 EVT VT = Op.getValueType();
2838 SDValue Src = Op.getOperand(0);
2839 SDNodeFlags Flags = Op->getFlags();
2840
2841 if (VT == MVT::f16) {
2842 // Nothing in half is a denormal when promoted to f32.
2843 assert(!Subtarget->has16BitInsts());
2844 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2845     SDValue Exp = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2846     return DAG.getNode(ISD::FP_ROUND, SL, VT, Exp,
2847 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2848 }
2849
2850 assert(VT == MVT::f32);
2851
2852 if (!needsDenormHandlingF32(DAG, Src, Flags))
2853 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2854
2855 // bool needs_scaling = x < -0x1.f80000p+6f;
2856 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2857
2858 // -nextafter(128.0, -1)
2859 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2860
2861 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2862
2863 SDValue NeedsScaling =
2864 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2865
2866 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2867 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2868
2869 SDValue AddOffset =
2870 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2871
2872 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2873 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2874
2875 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2876 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2877 SDValue ResultScale =
2878 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2879
2880 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2881 }
2882
2883 SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2884 SelectionDAG &DAG,
2885 SDNodeFlags Flags) const {
2886 EVT VT = X.getValueType();
2887 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2888
2889 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2890 // exp2(M_LOG2E_F * f);
2891 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2892 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2893 : (unsigned)ISD::FEXP2,
2894 SL, VT, Mul, Flags);
2895 }
2896
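  // exp(x) falls into the denormal range below roughly ln(2^-126)
  // (~ -0x1.5d58a0p+6), so shift such inputs up by 64 before the multiply by
  // log2(e) and scale the final result back down by approximately e^-64.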
2897 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2898
2899 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2900 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2901
2902 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2903
2904 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2905
2906 SDValue AdjustedX =
2907 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2908
2909 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2910
2911 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2912
2913 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2914 SDValue AdjustedResult =
2915 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2916
2917 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2918 Flags);
2919 }
2920
2921 /// Emit an approx-funcs-appropriate lowering for exp10. inf/nan should still be
2922 /// handled correctly.
2923 SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
2924 SelectionDAG &DAG,
2925 SDNodeFlags Flags) const {
2926 const EVT VT = X.getValueType();
2927 const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
2928
2929 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2930 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
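    // The two constants sum to log2(10); splitting the multiply across two
    // exp2 calls preserves more precision than a single f32 multiply.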
2931 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2932 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2933
2934 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
2935 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2936 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
2937 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2938 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
2939 }
2940
2941 // bool s = x < -0x1.2f7030p+5f;
2942 // x += s ? 0x1.0p+5f : 0.0f;
2943 // exp10 = exp2(x * 0x1.a92000p+1f) *
2944 // exp2(x * 0x1.4f0978p-11f) *
2945 // (s ? 0x1.9f623ep-107f : 1.0f);
2946
2947 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2948
2949 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
2950 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2951
2952 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
2953 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2954 SDValue AdjustedX =
2955 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2956
2957 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2958 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2959
2960 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
2961 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2962 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
2963 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2964
2965 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
2966
2967 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
2968 SDValue AdjustedResult =
2969 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
2970
2971 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
2972 Flags);
2973 }
2974
2975 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2976 EVT VT = Op.getValueType();
2977 SDLoc SL(Op);
2978 SDValue X = Op.getOperand(0);
2979 SDNodeFlags Flags = Op->getFlags();
2980 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
2981
2982 if (VT.getScalarType() == MVT::f16) {
2983 // v_exp_f16 (fmul x, log2e)
2984 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
2985 return lowerFEXPUnsafe(X, SL, DAG, Flags);
2986
2987 if (VT.isVector())
2988 return SDValue();
2989
2990 // exp(f16 x) ->
2991 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
2992
2993 // Nothing in half is a denormal when promoted to f32.
2994 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
2995 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
2996 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
2997 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2998 }
2999
3000 assert(VT == MVT::f32);
3001
3002 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3003 // library behavior. Also, is known-not-daz source sufficient?
3004 if (allowApproxFunc(DAG, Flags)) {
3005 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3006 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3007 }
3008
3009 // Algorithm:
3010 //
3011 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3012 //
3013 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3014 // n = 64*m + j, 0 <= j < 64
3015 //
3016 // e^x = 2^((64*m + j + f)/64)
3017 // = (2^m) * (2^(j/64)) * 2^(f/64)
3018 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3019 //
3020 // f = x*(64/ln(2)) - n
3021 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3022 //
3023 // e^x = (2^m) * (2^(j/64)) * e^r
3024 //
3025 // (2^(j/64)) is precomputed
3026 //
3027 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3028 // e^r = 1 + q
3029 //
3030 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3031 //
3032 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3033 SDNodeFlags FlagsNoContract = Flags;
3034 FlagsNoContract.setAllowContract(false);
3035
3036 SDValue PH, PL;
3037 if (Subtarget->hasFastFMAF32()) {
3038 const float c_exp = numbers::log2ef;
3039 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3040 const float c_exp10 = 0x1.a934f0p+1f;
3041 const float cc_exp10 = 0x1.2f346ep-24f;
3042
3043 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3044 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3045
3046 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3047 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3048 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3049 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3050 } else {
3051 const float ch_exp = 0x1.714000p+0f;
3052 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3053
3054 const float ch_exp10 = 0x1.a92000p+1f;
3055 const float cl_exp10 = 0x1.4f0978p-11f;
3056
3057 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3058 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3059
3060 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3061 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3062 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3063 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3064 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3065
3066 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3067
3068 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3069 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3070 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3071 }
3072
3073 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3074
3075 // It is unsafe to contract this fsub into the PH multiply.
3076 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3077
3078 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3079 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3080 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3081
3082 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3083
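  // Inputs below log(2^-149) (base 10 for exp10) underflow even the smallest
  // denormal, so clamp the result to +0. Inputs above log(FLT_MAX) overflow to
  // infinity, handled below when infinities are not assumed away.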
3084 SDValue UnderflowCheckConst =
3085 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3086
3087 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3088 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3089 SDValue Underflow =
3090 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3091
3092 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3093 const auto &Options = getTargetMachine().Options;
3094
3095 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3096 SDValue OverflowCheckConst =
3097 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3098 SDValue Overflow =
3099 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3100 SDValue Inf =
3101 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3102 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3103 }
3104
3105 return R;
3106 }
3107
3108 static bool isCtlzOpc(unsigned Opc) {
3109 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3110 }
3111
3112 static bool isCttzOpc(unsigned Opc) {
3113 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3114 }
3115
3116 SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3117 SelectionDAG &DAG) const {
3118 auto SL = SDLoc(Op);
3119 auto Opc = Op.getOpcode();
3120 auto Arg = Op.getOperand(0u);
3121 auto ResultVT = Op.getValueType();
3122
3123 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3124 return {};
3125
3126 assert(isCtlzOpc(Opc));
3127 assert(ResultVT == Arg.getValueType());
3128
3129 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3130 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3131 SDValue NewOp;
3132
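  // For the zero-undef form, shift the value into the top bits so the 32-bit
  // count directly gives the narrow result. Otherwise zero-extend and subtract
  // the number of widening bits from the 32-bit count.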
3133 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3134 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3135 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3136 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3137 } else {
3138 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3139 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3140 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3141 }
3142
3143 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3144 }
3145
3146 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3147 SDLoc SL(Op);
3148 SDValue Src = Op.getOperand(0);
3149
3150 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3151 bool Ctlz = isCtlzOpc(Op.getOpcode());
3152 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3153
3154 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3155 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3156 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3157
3158 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3159 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3160 // (cttz hi:lo) -> (umin (ffbl src), 32)
3161 // (ctlz_zero_undef src) -> (ffbh src)
3162 // (cttz_zero_undef src) -> (ffbl src)
3163
3164     // The 64-bit scalar version produces a 32-bit result:
3165 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3166 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3167 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3168 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3169 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3170 if (!ZeroUndef) {
3171 const SDValue ConstVal = DAG.getConstant(
3172 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3173 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3174 }
3175 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3176 }
3177
3178 SDValue Lo, Hi;
3179 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3180
3181 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3182 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3183
3184 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3185 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3186 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3187 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3188
3189 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3190 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3191 if (Ctlz)
3192 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3193 else
3194 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3195
3196 SDValue NewOpr;
3197 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3198 if (!ZeroUndef) {
3199 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3200 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3201 }
3202
3203 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3204 }
3205
3206 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3207 bool Signed) const {
3208   // The regular method of converting a 64-bit integer to float roughly consists of
3209 // 2 steps: normalization and rounding. In fact, after normalization, the
3210 // conversion from a 64-bit integer to a float is essentially the same as the
3211 // one from a 32-bit integer. The only difference is that it has more
3212 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3213   // 64-bit integer can be preprocessed to fit into a 32-bit integer and then
3214   // converted into the correct float value. The basic steps for the unsigned
3215 // conversion are illustrated in the following pseudo code:
3216 //
3217 // f32 uitofp(i64 u) {
3218 // i32 hi, lo = split(u);
3219 // // Only count the leading zeros in hi as we have native support of the
3220 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3221 // // reduced to a 32-bit one automatically.
3222 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3223 // u <<= shamt;
3224 // hi, lo = split(u);
3225 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3226 // // convert it as a 32-bit integer and scale the result back.
3227 // return uitofp(hi) * 2^(32 - shamt);
3228 // }
3229 //
3230 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3231   // sign bits instead. If 'ffbh_i32' is not available, the absolute value is
3232   // converted instead, followed by a negation based on the sign bit.
3233
3234 SDLoc SL(Op);
3235 SDValue Src = Op.getOperand(0);
3236
3237 SDValue Lo, Hi;
3238 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3239 SDValue Sign;
3240 SDValue ShAmt;
3241 if (Signed && Subtarget->isGCN()) {
3242 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3243 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3244 // account. That is, the maximal shift is
3245 // - 32 if Lo and Hi have opposite signs;
3246 // - 33 if Lo and Hi have the same sign.
3247 //
3248 // Or, MaxShAmt = 33 + OppositeSign, where
3249 //
3250 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3251 // - -1 if Lo and Hi have opposite signs; and
3252 // - 0 otherwise.
3253 //
3254 // All in all, ShAmt is calculated as
3255 //
3256 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3257 //
3258 // or
3259 //
3260 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3261 //
3262 // to reduce the critical path.
3263 SDValue OppositeSign = DAG.getNode(
3264 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3265 DAG.getConstant(31, SL, MVT::i32));
3266 SDValue MaxShAmt =
3267 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3268 OppositeSign);
3269 // Count the leading sign bits.
3270 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3271     // Unlike the unsigned conversion, the shift should be one bit less to
3272 // preserve the sign bit.
3273 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3274 DAG.getConstant(1, SL, MVT::i32));
3275 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3276 } else {
3277 if (Signed) {
3278 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3279 // absolute value first.
3280 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3281 DAG.getConstant(63, SL, MVT::i64));
3282 SDValue Abs =
3283 DAG.getNode(ISD::XOR, SL, MVT::i64,
3284 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3285 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3286 }
3287 // Count the leading zeros.
3288 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3289 // The shift amount for signed integers is [0, 32].
3290 }
3291 // Normalize the given 64-bit integer.
3292 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3293 // Split it again.
3294 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3295 // Calculate the adjust bit for rounding.
3296 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3297 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3298 DAG.getConstant(1, SL, MVT::i32), Lo);
3299 // Get the 32-bit normalized integer.
3300 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3301 // Convert the normalized 32-bit integer into f32.
3302 unsigned Opc =
3303 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3304 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3305
3306   // Finally, scale the converted floating-point value back, since the original
3307   // 64-bit integer was converted as a 32-bit one.
3308 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3309 ShAmt);
3310 // On GCN, use LDEXP directly.
3311 if (Subtarget->isGCN())
3312 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3313
3314 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3315 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3316 // exponent is enough to avoid overflowing into the sign bit.
3317 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3318 DAG.getConstant(23, SL, MVT::i32));
3319 SDValue IVal =
3320 DAG.getNode(ISD::ADD, SL, MVT::i32,
3321 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3322 if (Signed) {
3323 // Set the sign bit.
3324 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3325 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3326 DAG.getConstant(31, SL, MVT::i32));
3327 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3328 }
3329 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3330 }
3331
3332 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3333 bool Signed) const {
3334 SDLoc SL(Op);
3335 SDValue Src = Op.getOperand(0);
3336
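  // Convert the two halves separately and recombine:
  //
  //   f64 itofp(i64 v) {
  //     i32 hi, lo = split(v);
  //     return ldexp(itofp(hi), 32) + uitofp(lo); // single rounding in the add
  //   }
  //
  // The high half uses a signed or unsigned conversion as requested; the low
  // half is always unsigned.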
3337 SDValue Lo, Hi;
3338 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3339
3340 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3341 SL, MVT::f64, Hi);
3342
3343 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3344
3345 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3346 DAG.getConstant(32, SL, MVT::i32));
3347 // TODO: Should this propagate fast-math-flags?
3348 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3349 }
3350
3351 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3352 SelectionDAG &DAG) const {
3353 // TODO: Factor out code common with LowerSINT_TO_FP.
3354 EVT DestVT = Op.getValueType();
3355 SDValue Src = Op.getOperand(0);
3356 EVT SrcVT = Src.getValueType();
3357
3358 if (SrcVT == MVT::i16) {
3359 if (DestVT == MVT::f16)
3360 return Op;
3361 SDLoc DL(Op);
3362
3363 // Promote src to i32
3364 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3365 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3366 }
3367
3368 if (DestVT == MVT::bf16) {
3369 SDLoc SL(Op);
3370 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3371 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3372 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3373 }
3374
3375 if (SrcVT != MVT::i64)
3376 return Op;
3377
3378 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3379 SDLoc DL(Op);
3380
3381 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3382 SDValue FPRoundFlag =
3383 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3384 SDValue FPRound =
3385 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3386
3387 return FPRound;
3388 }
3389
3390 if (DestVT == MVT::f32)
3391 return LowerINT_TO_FP32(Op, DAG, false);
3392
3393 assert(DestVT == MVT::f64);
3394 return LowerINT_TO_FP64(Op, DAG, false);
3395 }
3396
3397 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3398 SelectionDAG &DAG) const {
3399 EVT DestVT = Op.getValueType();
3400
3401 SDValue Src = Op.getOperand(0);
3402 EVT SrcVT = Src.getValueType();
3403
3404 if (SrcVT == MVT::i16) {
3405 if (DestVT == MVT::f16)
3406 return Op;
3407
3408 SDLoc DL(Op);
3409 // Promote src to i32
3410 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3411 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3412 }
3413
3414 if (DestVT == MVT::bf16) {
3415 SDLoc SL(Op);
3416 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3417 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3418 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3419 }
3420
3421 if (SrcVT != MVT::i64)
3422 return Op;
3423
3424 // TODO: Factor out code common with LowerUINT_TO_FP.
3425
3426 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3427 SDLoc DL(Op);
3428 SDValue Src = Op.getOperand(0);
3429
3430 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3431 SDValue FPRoundFlag =
3432 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3433 SDValue FPRound =
3434 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3435
3436 return FPRound;
3437 }
3438
3439 if (DestVT == MVT::f32)
3440 return LowerINT_TO_FP32(Op, DAG, true);
3441
3442 assert(DestVT == MVT::f64);
3443 return LowerINT_TO_FP64(Op, DAG, true);
3444 }
3445
3446 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3447 bool Signed) const {
3448 SDLoc SL(Op);
3449
3450 SDValue Src = Op.getOperand(0);
3451 EVT SrcVT = Src.getValueType();
3452
3453 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3454
3455 // The basic idea of converting a floating point number into a pair of 32-bit
3456 // integers is illustrated as follows:
3457 //
3458 // tf := trunc(val);
3459 // hif := floor(tf * 2^-32);
3460 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3461 // hi := fptoi(hif);
3462 // lo := fptoi(lof);
3463 //
3464 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3465 SDValue Sign;
3466 if (Signed && SrcVT == MVT::f32) {
3467     // However, a 32-bit floating point number has only a 23-bit mantissa, which
3468     // is not enough to hold all the significant bits of `lof` if val is
3469     // negative. To avoid the loss of precision, we take the absolute value
3470     // after truncating and flip the result back based on the original
3471     // signedness.
3472 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3473 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3474 DAG.getConstant(31, SL, MVT::i32));
3475 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3476 }
3477
3478 SDValue K0, K1;
3479 if (SrcVT == MVT::f64) {
3480 K0 = DAG.getConstantFP(
3481 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3482 SrcVT);
3483 K1 = DAG.getConstantFP(
3484 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3485 SrcVT);
3486 } else {
3487 K0 = DAG.getConstantFP(
3488 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3489 K1 = DAG.getConstantFP(
3490 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3491 }
3492 // TODO: Should this propagate fast-math-flags?
3493 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3494
3495 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3496
3497 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3498
3499 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3500 : ISD::FP_TO_UINT,
3501 SL, MVT::i32, FloorMul);
3502 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3503
3504 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3505 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3506
3507 if (Signed && SrcVT == MVT::f32) {
3508 assert(Sign);
3509 // Flip the result based on the signedness, which is either all 0s or 1s.
3510 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3511 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3512 // r := xor(r, sign) - sign;
3513 Result =
3514 DAG.getNode(ISD::SUB, SL, MVT::i64,
3515 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3516 }
3517
3518 return Result;
3519 }
3520
3521 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3522 SDLoc DL(Op);
3523 SDValue N0 = Op.getOperand(0);
3524
3525 // Convert to target node to get known bits
3526 if (N0.getValueType() == MVT::f32)
3527 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3528
3529 if (getTargetMachine().Options.UnsafeFPMath) {
3530 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3531 return SDValue();
3532 }
3533
3534 assert(N0.getSimpleValueType() == MVT::f64);
3535
3536 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3537 const unsigned ExpMask = 0x7ff;
3538 const unsigned ExpBiasf64 = 1023;
3539 const unsigned ExpBiasf16 = 15;
3540 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3541 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3542 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3543 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3544 DAG.getConstant(32, DL, MVT::i64));
3545 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3546 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3547 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3548 DAG.getConstant(20, DL, MVT::i64));
3549 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3550 DAG.getConstant(ExpMask, DL, MVT::i32));
3551 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3552 // add the f16 bias (15) to get the biased exponent for the f16 format.
3553 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3554 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3555
3556 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3557 DAG.getConstant(8, DL, MVT::i32));
3558 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3559 DAG.getConstant(0xffe, DL, MVT::i32));
3560
3561 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3562 DAG.getConstant(0x1ff, DL, MVT::i32));
3563 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3564
3565 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3566 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3567
3568 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3569 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3570 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3571 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3572
3573 // N = M | (E << 12);
3574 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3575 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3576 DAG.getConstant(12, DL, MVT::i32)));
3577
3578 // B = clamp(1-E, 0, 13);
3579 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3580 One, E);
3581 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3582 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3583 DAG.getConstant(13, DL, MVT::i32));
3584
3585 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3586 DAG.getConstant(0x1000, DL, MVT::i32));
3587
3588 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3589 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3590 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3591 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3592
3593 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3594 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3595 DAG.getConstant(0x7, DL, MVT::i32));
3596 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3597 DAG.getConstant(2, DL, MVT::i32));
3598 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3599 One, Zero, ISD::SETEQ);
3600 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3601 One, Zero, ISD::SETGT);
3602 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3603 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3604
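  // E > 30 means the value is too large for a finite f16, so clamp to the
  // infinity pattern (0x7c00). E == 1039 corresponds to an all-ones f64
  // exponent field (Inf/NaN); in that case use the Inf/NaN pattern I built
  // above.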
3605 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3606 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3607 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3608 I, V, ISD::SETEQ);
3609
3610 // Extract the sign bit.
3611 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3612 DAG.getConstant(16, DL, MVT::i32));
3613 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3614 DAG.getConstant(0x8000, DL, MVT::i32));
3615
3616 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3617 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
3618 }
3619
3620 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3621 SelectionDAG &DAG) const {
3622 SDValue Src = Op.getOperand(0);
3623 unsigned OpOpcode = Op.getOpcode();
3624 EVT SrcVT = Src.getValueType();
3625 EVT DestVT = Op.getValueType();
3626
3627 // Will be selected natively
3628 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3629 return Op;
3630
3631 if (SrcVT == MVT::bf16) {
3632 SDLoc DL(Op);
3633 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3634 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3635 }
3636
3637 // Promote i16 to i32
3638 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3639 SDLoc DL(Op);
3640
3641 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3642 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3643 }
3644
3645 if (DestVT != MVT::i64)
3646 return Op;
3647
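  // An f16 source (or an f32 that was extended from f16) can never exceed the
  // i32 range, so do a 32-bit conversion and extend the result to i64.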
3648 if (SrcVT == MVT::f16 ||
3649 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3650 SDLoc DL(Op);
3651
3652 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3653 unsigned Ext =
3654 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3655 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3656 }
3657
3658 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3659 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3660
3661 return SDValue();
3662 }
3663
3664 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3665 SelectionDAG &DAG) const {
3666 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3667 MVT VT = Op.getSimpleValueType();
3668 MVT ScalarVT = VT.getScalarType();
3669
3670 assert(VT.isVector());
3671
3672 SDValue Src = Op.getOperand(0);
3673 SDLoc DL(Op);
3674
3675 // TODO: Don't scalarize on Evergreen?
3676 unsigned NElts = VT.getVectorNumElements();
3677 SmallVector<SDValue, 8> Args;
3678 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3679
3680 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3681 for (unsigned I = 0; I < NElts; ++I)
3682 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3683
3684 return DAG.getBuildVector(VT, DL, Args);
3685 }
3686
3687 //===----------------------------------------------------------------------===//
3688 // Custom DAG optimizations
3689 //===----------------------------------------------------------------------===//
3690
3691 static bool isU24(SDValue Op, SelectionDAG &DAG) {
3692 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3693 }
3694
3695 static bool isI24(SDValue Op, SelectionDAG &DAG) {
3696 EVT VT = Op.getValueType();
3697 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3698 // as unsigned 24-bit values.
3699 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3700 }
3701
3702 static SDValue simplifyMul24(SDNode *Node24,
3703 TargetLowering::DAGCombinerInfo &DCI) {
3704 SelectionDAG &DAG = DCI.DAG;
3705 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3706 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3707
3708 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3709 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3710 unsigned NewOpcode = Node24->getOpcode();
3711 if (IsIntrin) {
3712 unsigned IID = Node24->getConstantOperandVal(0);
3713 switch (IID) {
3714 case Intrinsic::amdgcn_mul_i24:
3715 NewOpcode = AMDGPUISD::MUL_I24;
3716 break;
3717 case Intrinsic::amdgcn_mul_u24:
3718 NewOpcode = AMDGPUISD::MUL_U24;
3719 break;
3720 case Intrinsic::amdgcn_mulhi_i24:
3721 NewOpcode = AMDGPUISD::MULHI_I24;
3722 break;
3723 case Intrinsic::amdgcn_mulhi_u24:
3724 NewOpcode = AMDGPUISD::MULHI_U24;
3725 break;
3726 default:
3727 llvm_unreachable("Expected 24-bit mul intrinsic");
3728 }
3729 }
3730
3731 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3732
3733 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3734 // the operands to have other uses, but will only perform simplifications that
3735 // involve bypassing some nodes for this user.
3736 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3737 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3738 if (DemandedLHS || DemandedRHS)
3739 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3740 DemandedLHS ? DemandedLHS : LHS,
3741 DemandedRHS ? DemandedRHS : RHS);
3742
3743 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3744 // operands if this node is the only user.
3745 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3746 return SDValue(Node24, 0);
3747 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3748 return SDValue(Node24, 0);
3749
3750 return SDValue();
3751 }
3752
3753 template <typename IntTy>
3754 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3755 uint32_t Width, const SDLoc &DL) {
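  // When the field does not reach bit 31, shift it up to the top of the 32-bit
  // value and back down so the upper bits are filled from the sign (signed
  // IntTy) or with zeros; otherwise a plain shift by Offset is enough.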
3756 if (Width + Offset < 32) {
3757 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3758 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3759 return DAG.getConstant(Result, DL, MVT::i32);
3760 }
3761
3762 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3763 }
3764
3765 static bool hasVolatileUser(SDNode *Val) {
3766 for (SDNode *U : Val->uses()) {
3767 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3768 if (M->isVolatile())
3769 return true;
3770 }
3771 }
3772
3773 return false;
3774 }
3775
3776 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3777 // i32 vectors are the canonical memory type.
3778 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3779 return false;
3780
3781 if (!VT.isByteSized())
3782 return false;
3783
3784 unsigned Size = VT.getStoreSize();
3785
3786 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3787 return false;
3788
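  // Odd sizes (3 bytes, or more than 4 bytes that are not a multiple of 4)
  // cannot be covered by the canonical i32-based memory types.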
3789 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3790 return false;
3791
3792 return true;
3793 }
3794
3795 // Replace a load of an illegal type with a bitcast of a load of a friendlier
3796 // type.
3797 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3798 DAGCombinerInfo &DCI) const {
3799 if (!DCI.isBeforeLegalize())
3800 return SDValue();
3801
3802 LoadSDNode *LN = cast<LoadSDNode>(N);
3803 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3804 return SDValue();
3805
3806 SDLoc SL(N);
3807 SelectionDAG &DAG = DCI.DAG;
3808 EVT VT = LN->getMemoryVT();
3809
3810 unsigned Size = VT.getStoreSize();
3811 Align Alignment = LN->getAlign();
3812 if (Alignment < Size && isTypeLegal(VT)) {
3813 unsigned IsFast;
3814 unsigned AS = LN->getAddressSpace();
3815
3816 // Expand unaligned loads earlier than legalization. Due to visitation order
3817 // problems during legalization, the emitted instructions to pack and unpack
3818 // the bytes again are not eliminated in the case of an unaligned copy.
3819 if (!allowsMisalignedMemoryAccesses(
3820 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3821 if (VT.isVector())
3822 return SplitVectorLoad(SDValue(LN, 0), DAG);
3823
3824 SDValue Ops[2];
3825 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3826
3827 return DAG.getMergeValues(Ops, SDLoc(N));
3828 }
3829
3830 if (!IsFast)
3831 return SDValue();
3832 }
3833
3834 if (!shouldCombineMemoryType(VT))
3835 return SDValue();
3836
3837 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3838
3839 SDValue NewLoad
3840 = DAG.getLoad(NewVT, SL, LN->getChain(),
3841 LN->getBasePtr(), LN->getMemOperand());
3842
3843 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3844 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3845 return SDValue(N, 0);
3846 }
3847
3848 // Replace store of an illegal type with a store of a bitcast to a friendlier
3849 // type.
3850 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3851 DAGCombinerInfo &DCI) const {
3852 if (!DCI.isBeforeLegalize())
3853 return SDValue();
3854
3855 StoreSDNode *SN = cast<StoreSDNode>(N);
3856 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3857 return SDValue();
3858
3859 EVT VT = SN->getMemoryVT();
3860 unsigned Size = VT.getStoreSize();
3861
3862 SDLoc SL(N);
3863 SelectionDAG &DAG = DCI.DAG;
3864 Align Alignment = SN->getAlign();
3865 if (Alignment < Size && isTypeLegal(VT)) {
3866 unsigned IsFast;
3867 unsigned AS = SN->getAddressSpace();
3868
3869 // Expand unaligned stores earlier than legalization. Due to visitation
3870 // order problems during legalization, the emitted instructions to pack and
3871 // unpack the bytes again are not eliminated in the case of an unaligned
3872 // copy.
3873 if (!allowsMisalignedMemoryAccesses(
3874 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3875 if (VT.isVector())
3876 return SplitVectorStore(SDValue(SN, 0), DAG);
3877
3878 return expandUnalignedStore(SN, DAG);
3879 }
3880
3881 if (!IsFast)
3882 return SDValue();
3883 }
3884
3885 if (!shouldCombineMemoryType(VT))
3886 return SDValue();
3887
3888 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3889 SDValue Val = SN->getValue();
3890
3891 //DCI.AddToWorklist(Val.getNode());
3892
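  // If the stored value has other uses, point them at a bitcast back to the
  // original type so only this store switches to the new type.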
3893 bool OtherUses = !Val.hasOneUse();
3894 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3895 if (OtherUses) {
3896 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3897 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3898 }
3899
3900 return DAG.getStore(SN->getChain(), SL, CastVal,
3901 SN->getBasePtr(), SN->getMemOperand());
3902 }
3903
3904 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3905 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3906 // issues.
3907 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3908 DAGCombinerInfo &DCI) const {
3909 SelectionDAG &DAG = DCI.DAG;
3910 SDValue N0 = N->getOperand(0);
3911
3912 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3913 // (vt2 (truncate (assertzext vt0:x, vt1)))
3914 if (N0.getOpcode() == ISD::TRUNCATE) {
3915 SDValue N1 = N->getOperand(1);
3916 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3917 SDLoc SL(N);
3918
3919 SDValue Src = N0.getOperand(0);
3920 EVT SrcVT = Src.getValueType();
3921 if (SrcVT.bitsGE(ExtVT)) {
3922 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3923 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3924 }
3925 }
3926
3927 return SDValue();
3928 }
3929
3930 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3931 SDNode *N, DAGCombinerInfo &DCI) const {
3932 unsigned IID = N->getConstantOperandVal(0);
3933 switch (IID) {
3934 case Intrinsic::amdgcn_mul_i24:
3935 case Intrinsic::amdgcn_mul_u24:
3936 case Intrinsic::amdgcn_mulhi_i24:
3937 case Intrinsic::amdgcn_mulhi_u24:
3938 return simplifyMul24(N, DCI);
3939 case Intrinsic::amdgcn_fract:
3940 case Intrinsic::amdgcn_rsq:
3941 case Intrinsic::amdgcn_rcp_legacy:
3942 case Intrinsic::amdgcn_rsq_legacy:
3943 case Intrinsic::amdgcn_rsq_clamp: {
3944 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3945 SDValue Src = N->getOperand(1);
3946 return Src.isUndef() ? Src : SDValue();
3947 }
3948 case Intrinsic::amdgcn_frexp_exp: {
3949 // frexp_exp (fneg x) -> frexp_exp x
3950 // frexp_exp (fabs x) -> frexp_exp x
3951 // frexp_exp (fneg (fabs x)) -> frexp_exp x
3952 SDValue Src = N->getOperand(1);
3953 SDValue PeekSign = peekFPSignOps(Src);
3954 if (PeekSign == Src)
3955 return SDValue();
3956 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
3957 0);
3958 }
3959 default:
3960 return SDValue();
3961 }
3962 }
3963
3964 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3965 /// binary operation \p Opc to it with the corresponding constant operands.
3966 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3967 DAGCombinerInfo &DCI, const SDLoc &SL,
3968 unsigned Opc, SDValue LHS,
3969 uint32_t ValLo, uint32_t ValHi) const {
3970 SelectionDAG &DAG = DCI.DAG;
3971 SDValue Lo, Hi;
3972 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3973
3974 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3975 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3976
3977 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3978 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3979
3980 // Re-visit the ands. It's possible we eliminated one of them and it could
3981 // simplify the vector.
3982 DCI.AddToWorklist(Lo.getNode());
3983 DCI.AddToWorklist(Hi.getNode());
3984
3985 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3986 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3987 }
3988
3989 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3990 DAGCombinerInfo &DCI) const {
3991 EVT VT = N->getValueType(0);
3992
3993 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3994 if (!RHS)
3995 return SDValue();
3996
3997 SDValue LHS = N->getOperand(0);
3998 unsigned RHSVal = RHS->getZExtValue();
3999 if (!RHSVal)
4000 return LHS;
4001
4002 SDLoc SL(N);
4003 SelectionDAG &DAG = DCI.DAG;
4004
4005 switch (LHS->getOpcode()) {
4006 default:
4007 break;
4008 case ISD::ZERO_EXTEND:
4009 case ISD::SIGN_EXTEND:
4010 case ISD::ANY_EXTEND: {
4011 SDValue X = LHS->getOperand(0);
4012
4013 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4014 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4015 // Prefer build_vector as the canonical form if packed types are legal.
4016       // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
4017 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
4018 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
4019 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4020 }
4021
4022 // shl (ext x) => zext (shl x), if shift does not overflow int
4023 if (VT != MVT::i64)
4024 break;
4025 KnownBits Known = DAG.computeKnownBits(X);
4026 unsigned LZ = Known.countMinLeadingZeros();
4027 if (LZ < RHSVal)
4028 break;
4029 EVT XVT = X.getValueType();
4030 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
4031 return DAG.getZExtOrTrunc(Shl, SL, VT);
4032 }
4033 }
4034
4035 if (VT != MVT::i64)
4036 return SDValue();
4037
4038 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
4039
4040 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4041 // common case, splitting this into a move and a 32-bit shift is faster and
4042 // the same code size.
4043 if (RHSVal < 32)
4044 return SDValue();
4045
4046 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
4047
4048 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4049 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
4050
4051 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4052
4053 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
4054 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4055 }
4056
4057 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4058 DAGCombinerInfo &DCI) const {
4059 if (N->getValueType(0) != MVT::i64)
4060 return SDValue();
4061
4062 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4063 if (!RHS)
4064 return SDValue();
4065
4066 SelectionDAG &DAG = DCI.DAG;
4067 SDLoc SL(N);
4068 unsigned RHSVal = RHS->getZExtValue();
4069
4070 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
4071 if (RHSVal == 32) {
4072 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4073 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4074 DAG.getConstant(31, SL, MVT::i32));
4075
4076 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
4077 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4078 }
4079
4080 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
4081 if (RHSVal == 63) {
4082 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4083 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4084 DAG.getConstant(31, SL, MVT::i32));
4085 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
4086 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4087 }
4088
4089 return SDValue();
4090 }
4091
4092 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4093 DAGCombinerInfo &DCI) const {
4094 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4095 if (!RHS)
4096 return SDValue();
4097
4098 EVT VT = N->getValueType(0);
4099 SDValue LHS = N->getOperand(0);
4100 unsigned ShiftAmt = RHS->getZExtValue();
4101 SelectionDAG &DAG = DCI.DAG;
4102 SDLoc SL(N);
4103
4104   // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
4105   // This improves the ability to match BFE patterns in isel.
4106 if (LHS.getOpcode() == ISD::AND) {
4107 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4108 unsigned MaskIdx, MaskLen;
4109 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4110 MaskIdx == ShiftAmt) {
4111 return DAG.getNode(
4112 ISD::AND, SL, VT,
4113 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
4114 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
4115 }
4116 }
4117 }
4118
4119 if (VT != MVT::i64)
4120 return SDValue();
4121
4122 if (ShiftAmt < 32)
4123 return SDValue();
4124
4125 // srl i64:x, C for C >= 32
4126 // =>
4127 // build_pair (srl hi_32(x), C - 32), 0
4128 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4129
4130 SDValue Hi = getHiHalf64(LHS, DAG);
4131
4132 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
4133 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
4134
4135 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
4136
4137 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
4138 }
4139
4140 SDValue AMDGPUTargetLowering::performTruncateCombine(
4141 SDNode *N, DAGCombinerInfo &DCI) const {
4142 SDLoc SL(N);
4143 SelectionDAG &DAG = DCI.DAG;
4144 EVT VT = N->getValueType(0);
4145 SDValue Src = N->getOperand(0);
4146
4147 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4148 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4149 SDValue Vec = Src.getOperand(0);
4150 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4151 SDValue Elt0 = Vec.getOperand(0);
4152 EVT EltVT = Elt0.getValueType();
4153 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4154 if (EltVT.isFloatingPoint()) {
4155 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4156 EltVT.changeTypeToInteger(), Elt0);
4157 }
4158
4159 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4160 }
4161 }
4162 }
4163
4164 // Equivalent of above for accessing the high element of a vector as an
4165 // integer operation.
4166 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4167 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4168 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
4169 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
4170 SDValue BV = stripBitcast(Src.getOperand(0));
4171 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
4172 BV.getValueType().getVectorNumElements() == 2) {
4173 SDValue SrcElt = BV.getOperand(1);
4174 EVT SrcEltVT = SrcElt.getValueType();
4175 if (SrcEltVT.isFloatingPoint()) {
4176 SrcElt = DAG.getNode(ISD::BITCAST, SL,
4177 SrcEltVT.changeTypeToInteger(), SrcElt);
4178 }
4179
4180 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4181 }
4182 }
4183 }
4184 }
4185
4186 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4187 //
4188 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4189 // i16 (trunc (srl (i32 (trunc x), K)))
4190 if (VT.getScalarSizeInBits() < 32) {
4191 EVT SrcVT = Src.getValueType();
4192 if (SrcVT.getScalarSizeInBits() > 32 &&
4193 (Src.getOpcode() == ISD::SRL ||
4194 Src.getOpcode() == ISD::SRA ||
4195 Src.getOpcode() == ISD::SHL)) {
4196 SDValue Amt = Src.getOperand(1);
4197 KnownBits Known = DAG.computeKnownBits(Amt);
4198
4199 // - For left shifts, do the transform as long as the shift
4200 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4201 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4202 // losing information stored in the high bits when truncating.
4203 const unsigned MaxCstSize =
4204 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4205 if (Known.getMaxValue().ule(MaxCstSize)) {
4206 EVT MidVT = VT.isVector() ?
4207 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4208 VT.getVectorNumElements()) : MVT::i32;
4209
4210 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4211 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4212 Src.getOperand(0));
4213 DCI.AddToWorklist(Trunc.getNode());
4214
4215 if (Amt.getValueType() != NewShiftVT) {
4216 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4217 DCI.AddToWorklist(Amt.getNode());
4218 }
4219
4220 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4221 Trunc, Amt);
4222 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4223 }
4224 }
4225 }
4226
4227 return SDValue();
4228 }
4229
4230 // We need to specifically handle i64 mul here to avoid unnecessary conversion
4231 // instructions. If we only match on the legalized i64 mul expansion,
4232 // SimplifyDemandedBits will be unable to remove them because there will be
4233 // multiple uses due to the separate mul + mulh[su].
4234 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4235 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4236 if (Size <= 32) {
4237 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4238 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4239 }
4240
4241 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4242 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4243
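  // For the wider case, build the low and high 32-bit halves with separate
  // mul24 / mulhi24 nodes and pair them into a 64-bit result.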
4244 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4245 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4246
4247 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4248 }
4249
4250 /// If \p V is an add of a constant 1, returns the other operand. Otherwise
4251 /// return SDValue().
4252 static SDValue getAddOneOp(const SDNode *V) {
4253 if (V->getOpcode() != ISD::ADD)
4254 return SDValue();
4255
4256 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4257 }
4258
4259 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4260 DAGCombinerInfo &DCI) const {
4261 assert(N->getOpcode() == ISD::MUL);
4262 EVT VT = N->getValueType(0);
4263
4264 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4265 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4266 // unnecessarily). isDivergent() is used as an approximation of whether the
4267 // value is in an SGPR.
4268 if (!N->isDivergent())
4269 return SDValue();
4270
4271 unsigned Size = VT.getSizeInBits();
4272 if (VT.isVector() || Size > 64)
4273 return SDValue();
4274
4275 SelectionDAG &DAG = DCI.DAG;
4276 SDLoc DL(N);
4277
4278 SDValue N0 = N->getOperand(0);
4279 SDValue N1 = N->getOperand(1);
4280
4281 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4282 // matching.
4283
4284 // mul x, (add y, 1) -> add (mul x, y), x
4285 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4286 SDValue AddOp = getAddOneOp(V.getNode());
4287 if (!AddOp)
4288 return SDValue();
4289
4290 if (V.hasOneUse() || all_of(V->uses(), [](const SDNode *U) -> bool {
4291 return U->getOpcode() == ISD::MUL;
4292 }))
4293 return AddOp;
4294
4295 return SDValue();
4296 };
4297
4298 // FIXME: The selection pattern is not properly checking for commuted
4299 // operands, so we have to place the mul in the LHS
4300 if (SDValue MulOper = IsFoldableAdd(N0)) {
4301 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4302 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4303 }
4304
4305 if (SDValue MulOper = IsFoldableAdd(N1)) {
4306 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4307 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4308 }
4309
4310   // There are native i16 integer mul/mad instructions, so leave small types to those.
4311 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4312 return SDValue();
4313
4314 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4315 // in the source into any_extends if the result of the mul is truncated. Since
4316 // we can assume the high bits are whatever we want, use the underlying value
4317   // to keep the unknown high bits from interfering.
4318 if (N0.getOpcode() == ISD::ANY_EXTEND)
4319 N0 = N0.getOperand(0);
4320
4321 if (N1.getOpcode() == ISD::ANY_EXTEND)
4322 N1 = N1.getOperand(0);
4323
4324 SDValue Mul;
4325
4326 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4327 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4328 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4329 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4330 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4331 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4332 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4333 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4334 } else {
4335 return SDValue();
4336 }
4337
4338 // We need to use sext even for MUL_U24, because MUL_U24 is used
4339   // for signed multiplies of 8- and 16-bit types.
4340 return DAG.getSExtOrTrunc(Mul, DL, VT);
4341 }
4342
4343 SDValue
4344 AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4345 DAGCombinerInfo &DCI) const {
4346 if (N->getValueType(0) != MVT::i32)
4347 return SDValue();
4348
4349 SelectionDAG &DAG = DCI.DAG;
4350 SDLoc DL(N);
4351
4352 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4353 SDValue N0 = N->getOperand(0);
4354 SDValue N1 = N->getOperand(1);
4355
4356 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4357 // in the source into any_extends if the result of the mul is truncated. Since
4358 // we can assume the high bits are whatever we want, use the underlying value
4359   // to keep the unknown high bits from interfering.
4360 if (N0.getOpcode() == ISD::ANY_EXTEND)
4361 N0 = N0.getOperand(0);
4362 if (N1.getOpcode() == ISD::ANY_EXTEND)
4363 N1 = N1.getOperand(0);
4364
4365 // Try to use two fast 24-bit multiplies (one for each half of the result)
4366 // instead of one slow extending multiply.
4367 unsigned LoOpcode = 0;
4368 unsigned HiOpcode = 0;
4369 if (Signed) {
4370 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4371 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4372 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4373 LoOpcode = AMDGPUISD::MUL_I24;
4374 HiOpcode = AMDGPUISD::MULHI_I24;
4375 }
4376 } else {
4377 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4378 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4379 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4380 LoOpcode = AMDGPUISD::MUL_U24;
4381 HiOpcode = AMDGPUISD::MULHI_U24;
4382 }
4383 }
4384 if (!LoOpcode)
4385 return SDValue();
4386
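  // Emit both halves and replace the two results of the MUL_LOHI node;
  // returning SDValue(N, 0) tells the combiner the node was handled in place.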
4387 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4388 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4389 DCI.CombineTo(N, Lo, Hi);
4390 return SDValue(N, 0);
4391 }
4392
4393 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4394 DAGCombinerInfo &DCI) const {
4395 EVT VT = N->getValueType(0);
4396
4397 if (!Subtarget->hasMulI24() || VT.isVector())
4398 return SDValue();
4399
4400 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4401 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4402 // unnecessarily). isDivergent() is used as an approximation of whether the
4403 // value is in an SGPR.
4404 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4405 // valu op anyway)
4406 if (Subtarget->hasSMulHi() && !N->isDivergent())
4407 return SDValue();
4408
4409 SelectionDAG &DAG = DCI.DAG;
4410 SDLoc DL(N);
4411
4412 SDValue N0 = N->getOperand(0);
4413 SDValue N1 = N->getOperand(1);
4414
4415 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4416 return SDValue();
4417
4418 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4419 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4420
4421 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4422 DCI.AddToWorklist(Mulhi.getNode());
4423 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4424 }
4425
4426 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4427 DAGCombinerInfo &DCI) const {
4428 EVT VT = N->getValueType(0);
4429
4430 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4431 return SDValue();
4432
4433 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4434 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4435 // unnecessarily). isDivergent() is used as an approximation of whether the
4436 // value is in an SGPR.
4437 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4438 // valu op anyway)
4439 if (Subtarget->hasSMulHi() && !N->isDivergent())
4440 return SDValue();
4441
4442 SelectionDAG &DAG = DCI.DAG;
4443 SDLoc DL(N);
4444
4445 SDValue N0 = N->getOperand(0);
4446 SDValue N1 = N->getOperand(1);
4447
4448 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4449 return SDValue();
4450
4451 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4452 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4453
4454 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4455 DCI.AddToWorklist(Mulhi.getNode());
4456 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4457 }
4458
4459 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4460 SDValue Op,
4461 const SDLoc &DL,
4462 unsigned Opc) const {
4463 EVT VT = Op.getValueType();
4464 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4465 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4466 LegalVT != MVT::i16))
4467 return SDValue();
4468
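  // The ffbh/ffbl nodes operate on i32, so widen a narrower input first and
  // truncate the result back to the original type afterwards.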
4469 if (VT != MVT::i32)
4470 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4471
4472 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4473 if (VT != MVT::i32)
4474 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4475
4476 return FFBX;
4477 }
4478
4479 // The native instructions return -1 on 0 input. Optimize out a select that
4480 // produces -1 on 0.
4481 //
4482 // TODO: If zero is not undef, we could also do this if the output is compared
4483 // against the bitwidth.
4484 //
4485 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4486 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4487 SDValue LHS, SDValue RHS,
4488 DAGCombinerInfo &DCI) const {
4489 if (!isNullConstant(Cond.getOperand(1)))
4490 return SDValue();
4491
4492 SelectionDAG &DAG = DCI.DAG;
4493 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4494 SDValue CmpLHS = Cond.getOperand(0);
4495
4496 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4497 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4498 if (CCOpcode == ISD::SETEQ &&
4499 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4500 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4501 unsigned Opc =
4502 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4503 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4504 }
4505
4506 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4507 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4508 if (CCOpcode == ISD::SETNE &&
4509 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4510 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4511 unsigned Opc =
4512 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4513
4514 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4515 }
4516
4517 return SDValue();
4518 }
4519
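// Given a select whose operands N1 and N2 are both the same unary FP
// operation, rebuild it as that operation applied to a select of their
// sources, so the operation can later fold into users as a source modifier.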
4520 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4521 unsigned Op,
4522 const SDLoc &SL,
4523 SDValue Cond,
4524 SDValue N1,
4525 SDValue N2) {
4526 SelectionDAG &DAG = DCI.DAG;
4527 EVT VT = N1.getValueType();
4528
4529 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4530 N1.getOperand(0), N2.getOperand(0));
4531 DCI.AddToWorklist(NewSelect.getNode());
4532 return DAG.getNode(Op, SL, VT, NewSelect);
4533 }
4534
4535 // Pull a free FP operation out of a select so it may fold into uses.
4536 //
4537 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4538 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
4539 //
4540 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4541 // select c, (fabs x), +k -> fabs (select c, x, k)
4542 SDValue
4543 AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
4544 SDValue N) const {
4545 SelectionDAG &DAG = DCI.DAG;
4546 SDValue Cond = N.getOperand(0);
4547 SDValue LHS = N.getOperand(1);
4548 SDValue RHS = N.getOperand(2);
4549
4550 EVT VT = N.getValueType();
4551 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4552 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4553 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4554 return SDValue();
4555
4556 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4557 SDLoc(N), Cond, LHS, RHS);
4558 }
4559
4560 bool Inv = false;
4561 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4562 std::swap(LHS, RHS);
4563 Inv = true;
4564 }
4565
4566 // TODO: Support vector constants.
4567 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4568 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4569 !selectSupportsSourceMods(N.getNode())) {
4570 SDLoc SL(N);
4571 // If one side is an fneg/fabs and the other is a constant, we can push the
4572 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4573 SDValue NewLHS = LHS.getOperand(0);
4574 SDValue NewRHS = RHS;
4575
4576 // Careful: if the neg can be folded up, don't try to pull it back down.
4577 bool ShouldFoldNeg = true;
4578
4579 if (NewLHS.hasOneUse()) {
4580 unsigned Opc = NewLHS.getOpcode();
4581 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4582 ShouldFoldNeg = false;
4583 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4584 ShouldFoldNeg = false;
4585 }
4586
4587 if (ShouldFoldNeg) {
4588 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4589 return SDValue();
4590
4591 // We're going to be forced to use a source modifier anyway, there's no
4592 // point to pulling the negate out unless we can get a size reduction by
4593 // negating the constant.
4594 //
4595 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4596 // about cheaper constants.
4597 if (NewLHS.getOpcode() == ISD::FABS &&
4598 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4599 return SDValue();
4600
4601 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4602 return SDValue();
4603
4604 if (LHS.getOpcode() == ISD::FNEG)
4605 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4606
4607 if (Inv)
4608 std::swap(NewLHS, NewRHS);
4609
4610 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4611 Cond, NewLHS, NewRHS);
4612 DCI.AddToWorklist(NewSelect.getNode());
4613 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4614 }
4615 }
4616
4617 return SDValue();
4618 }
4619
4620 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4621 DAGCombinerInfo &DCI) const {
4622 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4623 return Folded;
4624
4625 SDValue Cond = N->getOperand(0);
4626 if (Cond.getOpcode() != ISD::SETCC)
4627 return SDValue();
4628
4629 EVT VT = N->getValueType(0);
4630 SDValue LHS = Cond.getOperand(0);
4631 SDValue RHS = Cond.getOperand(1);
4632 SDValue CC = Cond.getOperand(2);
4633
4634 SDValue True = N->getOperand(1);
4635 SDValue False = N->getOperand(2);
4636
4637 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4638 SelectionDAG &DAG = DCI.DAG;
4639 if (DAG.isConstantValueOfAnyType(True) &&
4640 !DAG.isConstantValueOfAnyType(False)) {
4641 // Swap cmp + select pair to move constant to false input.
4642 // This will allow using VOPC cndmasks more often.
4643 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4644
4645 SDLoc SL(N);
4646 ISD::CondCode NewCC =
4647 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4648
4649 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4650 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4651 }
4652
4653 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4654 SDValue MinMax
4655 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4656 // Revisit this node so we can catch min3/max3/med3 patterns.
4657 //DCI.AddToWorklist(MinMax.getNode());
4658 return MinMax;
4659 }
4660 }
4661
4662 // There's no reason to not do this if the condition has other uses.
4663 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4664 }
4665
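// Match the bit patterns of 1.0 / (2.0 * pi) in half, single and double
// precision. Subtargets with hasInv2PiInlineImm() can encode this value as an
// inline immediate.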
4666 static bool isInv2Pi(const APFloat &APF) {
4667 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4668 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4669 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4670
4671 return APF.bitwiseIsEqual(KF16) ||
4672 APF.bitwiseIsEqual(KF32) ||
4673 APF.bitwiseIsEqual(KF64);
4674 }
4675
4676 // 0 and 1.0 / (2.0 * pi) have inline immediates, but their negated forms do
4677 // not, so there is an additional cost to negate them.
4678 TargetLowering::NegatibleCost
4679 AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4680 if (C->isZero())
4681 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4682
4683 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4684 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4685
4686 return NegatibleCost::Neutral;
4687 }
4688
4689 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
4690 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4691 return getConstantNegateCost(C) == NegatibleCost::Expensive;
4692 return false;
4693 }
4694
4695 bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
4696 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4697 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4698 return false;
4699 }
4700
4701 static unsigned inverseMinMax(unsigned Opc) {
4702 switch (Opc) {
4703 case ISD::FMAXNUM:
4704 return ISD::FMINNUM;
4705 case ISD::FMINNUM:
4706 return ISD::FMAXNUM;
4707 case ISD::FMAXNUM_IEEE:
4708 return ISD::FMINNUM_IEEE;
4709 case ISD::FMINNUM_IEEE:
4710 return ISD::FMAXNUM_IEEE;
4711 case ISD::FMAXIMUM:
4712 return ISD::FMINIMUM;
4713 case ISD::FMINIMUM:
4714 return ISD::FMAXIMUM;
4715 case AMDGPUISD::FMAX_LEGACY:
4716 return AMDGPUISD::FMIN_LEGACY;
4717 case AMDGPUISD::FMIN_LEGACY:
4718 return AMDGPUISD::FMAX_LEGACY;
4719 default:
4720 llvm_unreachable("invalid min/max opcode");
4721 }
4722 }
4723
4724 /// \return true if it's profitable to try to push an fneg into its source
4725 /// instruction.
4726 bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4727 // If the input has multiple uses and we can either fold the negate down, or
4728 // the other uses cannot, give up. This both prevents unprofitable
4729 // transformations and infinite loops: we won't repeatedly try to fold around
4730 // a negate that has no 'good' form.
4731 if (N0.hasOneUse()) {
4732 // This may be able to fold into the source, but at a code size cost. Don't
4733 // fold if the fold into the user is free.
4734 if (allUsesHaveSourceMods(N, 0))
4735 return false;
4736 } else {
4737 if (fnegFoldsIntoOp(N0.getNode()) &&
4738 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
4739 return false;
4740 }
4741
4742 return true;
4743 }
4744
4745 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4746 DAGCombinerInfo &DCI) const {
4747 SelectionDAG &DAG = DCI.DAG;
4748 SDValue N0 = N->getOperand(0);
4749 EVT VT = N->getValueType(0);
4750
4751 unsigned Opc = N0.getOpcode();
4752
4753 if (!shouldFoldFNegIntoSrc(N, N0))
4754 return SDValue();
4755
4756 SDLoc SL(N);
4757 switch (Opc) {
4758 case ISD::FADD: {
4759 if (!mayIgnoreSignedZero(N0))
4760 return SDValue();
4761
4762 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
4763 SDValue LHS = N0.getOperand(0);
4764 SDValue RHS = N0.getOperand(1);
4765
4766 if (LHS.getOpcode() != ISD::FNEG)
4767 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4768 else
4769 LHS = LHS.getOperand(0);
4770
4771 if (RHS.getOpcode() != ISD::FNEG)
4772 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4773 else
4774 RHS = RHS.getOperand(0);
4775
4776 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
4777 if (Res.getOpcode() != ISD::FADD)
4778 return SDValue(); // Op got folded away.
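    // The original fadd had other uses, so recreate its value for them as the
    // negation of the new node.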
4779 if (!N0.hasOneUse())
4780 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4781 return Res;
4782 }
4783 case ISD::FMUL:
4784 case AMDGPUISD::FMUL_LEGACY: {
4785 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
4786 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
4787 SDValue LHS = N0.getOperand(0);
4788 SDValue RHS = N0.getOperand(1);
4789
4790 if (LHS.getOpcode() == ISD::FNEG)
4791 LHS = LHS.getOperand(0);
4792 else if (RHS.getOpcode() == ISD::FNEG)
4793 RHS = RHS.getOperand(0);
4794 else
4795 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4796
4797 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
4798 if (Res.getOpcode() != Opc)
4799 return SDValue(); // Op got folded away.
4800 if (!N0.hasOneUse())
4801 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4802 return Res;
4803 }
4804 case ISD::FMA:
4805 case ISD::FMAD: {
4806 // TODO: handle llvm.amdgcn.fma.legacy
4807 if (!mayIgnoreSignedZero(N0))
4808 return SDValue();
4809
4810 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
4811 SDValue LHS = N0.getOperand(0);
4812 SDValue MHS = N0.getOperand(1);
4813 SDValue RHS = N0.getOperand(2);
4814
4815 if (LHS.getOpcode() == ISD::FNEG)
4816 LHS = LHS.getOperand(0);
4817 else if (MHS.getOpcode() == ISD::FNEG)
4818 MHS = MHS.getOperand(0);
4819 else
4820 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
4821
4822 if (RHS.getOpcode() != ISD::FNEG)
4823 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4824 else
4825 RHS = RHS.getOperand(0);
4826
4827 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
4828 if (Res.getOpcode() != Opc)
4829 return SDValue(); // Op got folded away.
4830 if (!N0.hasOneUse())
4831 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4832 return Res;
4833 }
4834 case ISD::FMAXNUM:
4835 case ISD::FMINNUM:
4836 case ISD::FMAXNUM_IEEE:
4837 case ISD::FMINNUM_IEEE:
4838 case ISD::FMINIMUM:
4839 case ISD::FMAXIMUM:
4840 case AMDGPUISD::FMAX_LEGACY:
4841 case AMDGPUISD::FMIN_LEGACY: {
4842 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
4843 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
4844 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
4845 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
4846
4847 SDValue LHS = N0.getOperand(0);
4848 SDValue RHS = N0.getOperand(1);
4849
4850 // 0 doesn't have a negated inline immediate.
4851 // TODO: This constant check should be generalized to other operations.
4852 if (isConstantCostlierToNegate(RHS))
4853 return SDValue();
4854
4855 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4856 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4857 unsigned Opposite = inverseMinMax(Opc);
4858
4859 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
4860 if (Res.getOpcode() != Opposite)
4861 return SDValue(); // Op got folded away.
4862 if (!N0.hasOneUse())
4863 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4864 return Res;
4865 }
4866 case AMDGPUISD::FMED3: {
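    // fneg (fmed3 x, y, z) -> fmed3 (fneg x), (fneg y), (fneg z)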
4867 SDValue Ops[3];
4868 for (unsigned I = 0; I < 3; ++I)
4869 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
4870
4871 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
4872 if (Res.getOpcode() != AMDGPUISD::FMED3)
4873 return SDValue(); // Op got folded away.
4874
4875 if (!N0.hasOneUse()) {
4876 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
4877 DAG.ReplaceAllUsesWith(N0, Neg);
4878
4879 for (SDNode *U : Neg->uses())
4880 DCI.AddToWorklist(U);
4881 }
4882
4883 return Res;
4884 }
4885 case ISD::FP_EXTEND:
4886 case ISD::FTRUNC:
4887 case ISD::FRINT:
4888 case ISD::FNEARBYINT: // XXX - Should fround be handled?
4889 case ISD::FROUNDEVEN:
4890 case ISD::FSIN:
4891 case ISD::FCANONICALIZE:
4892 case AMDGPUISD::RCP:
4893 case AMDGPUISD::RCP_LEGACY:
4894 case AMDGPUISD::RCP_IFLAG:
4895 case AMDGPUISD::SIN_HW: {
4896 SDValue CvtSrc = N0.getOperand(0);
4897 if (CvtSrc.getOpcode() == ISD::FNEG) {
4898 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
4899 // (fneg (rcp (fneg x))) -> (rcp x)
4900 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
4901 }
4902
4903 if (!N0.hasOneUse())
4904 return SDValue();
4905
4906 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
4907 // (fneg (rcp x)) -> (rcp (fneg x))
4908 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4909 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
4910 }
4911 case ISD::FP_ROUND: {
4912 SDValue CvtSrc = N0.getOperand(0);
4913
4914 if (CvtSrc.getOpcode() == ISD::FNEG) {
4915 // (fneg (fp_round (fneg x))) -> (fp_round x)
4916 return DAG.getNode(ISD::FP_ROUND, SL, VT,
4917 CvtSrc.getOperand(0), N0.getOperand(1));
4918 }
4919
4920 if (!N0.hasOneUse())
4921 return SDValue();
4922
4923 // (fneg (fp_round x)) -> (fp_round (fneg x))
4924 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4925 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4926 }
4927 case ISD::FP16_TO_FP: {
4928 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4929 // f16, but legalization of f16 fneg ends up pulling it out of the source.
4930 // Put the fneg back as a legal source operation that can be matched later.
4931 SDLoc SL(N);
4932
4933 SDValue Src = N0.getOperand(0);
4934 EVT SrcVT = Src.getValueType();
4935
4936 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4937 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4938 DAG.getConstant(0x8000, SL, SrcVT));
4939 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4940 }
4941 case ISD::SELECT: {
4942 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
4943 // TODO: Invert conditions of foldFreeOpFromSelect
4944 return SDValue();
4945 }
4946 case ISD::BITCAST: {
4947 SDLoc SL(N);
4948 SDValue BCSrc = N0.getOperand(0);
4949 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
4950 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
4951 if (HighBits.getValueType().getSizeInBits() != 32 ||
4952 !fnegFoldsIntoOp(HighBits.getNode()))
4953 return SDValue();
4954
4955       // f64 fneg only really needs to operate on the high half of the
4956 // register, so try to force it to an f32 operation to help make use of
4957 // source modifiers.
4958 //
4959 //
4960 // fneg (f64 (bitcast (build_vector x, y))) ->
4961 // f64 (bitcast (build_vector (bitcast i32:x to f32),
4962 // (fneg (bitcast i32:y to f32)))
4963
4964 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
4965 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
4966 SDValue CastBack =
4967 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
4968
4969 SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
4970 Ops.back() = CastBack;
4971 DCI.AddToWorklist(NegHi.getNode());
4972 SDValue Build =
4973 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
4974 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
4975
4976 if (!N0.hasOneUse())
4977 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
4978 return Result;
4979 }
4980
4981 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
4982 BCSrc.hasOneUse()) {
4983 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
4984 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
4985
4986 // TODO: Cast back result for multiple uses is beneficial in some cases.
4987
4988 SDValue LHS =
4989 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
4990 SDValue RHS =
4991 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
4992
4993 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
4994 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
4995
4996 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
4997 NegRHS);
4998 }
4999
5000 return SDValue();
5001 }
5002 default:
5003 return SDValue();
5004 }
5005 }
5006
5007 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5008 DAGCombinerInfo &DCI) const {
5009 SelectionDAG &DAG = DCI.DAG;
5010 SDValue N0 = N->getOperand(0);
5011
5012 if (!N0.hasOneUse())
5013 return SDValue();
5014
5015 switch (N0.getOpcode()) {
5016 case ISD::FP16_TO_FP: {
5017 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
5018 SDLoc SL(N);
5019 SDValue Src = N0.getOperand(0);
5020 EVT SrcVT = Src.getValueType();
5021
5022 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5023 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5024 DAG.getConstant(0x7fff, SL, SrcVT));
5025 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5026 }
5027 default:
5028 return SDValue();
5029 }
5030 }
5031
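// Constant fold rcp of a floating-point constant into its reciprocal.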
5032 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5033 DAGCombinerInfo &DCI) const {
5034 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5035 if (!CFP)
5036 return SDValue();
5037
5038 // XXX - Should this flush denormals?
5039 const APFloat &Val = CFP->getValueAPF();
5040 APFloat One(Val.getSemantics(), "1.0");
5041 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5042 }
5043
5044 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5045 DAGCombinerInfo &DCI) const {
5046 SelectionDAG &DAG = DCI.DAG;
5047 SDLoc DL(N);
5048
5049 switch(N->getOpcode()) {
5050 default:
5051 break;
5052 case ISD::BITCAST: {
5053 EVT DestVT = N->getValueType(0);
5054
5055 // Push casts through vector builds. This helps avoid emitting a large
5056 // number of copies when materializing floating point vector constants.
5057 //
5058 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5059     // vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5060 if (DestVT.isVector()) {
5061 SDValue Src = N->getOperand(0);
5062 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5063 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5064 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
5065 EVT SrcVT = Src.getValueType();
5066 unsigned NElts = DestVT.getVectorNumElements();
5067
5068 if (SrcVT.getVectorNumElements() == NElts) {
5069 EVT DestEltVT = DestVT.getVectorElementType();
5070
5071 SmallVector<SDValue, 8> CastedElts;
5072 SDLoc SL(N);
5073 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5074 SDValue Elt = Src.getOperand(I);
5075 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5076 }
5077
5078 return DAG.getBuildVector(DestVT, SL, CastedElts);
5079 }
5080 }
5081 }
5082
5083 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5084 break;
5085
5086 // Fold bitcasts of constants.
5087 //
5088 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5089 // TODO: Generalize and move to DAGCombiner
5090 SDValue Src = N->getOperand(0);
5091 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5092 SDLoc SL(N);
5093 uint64_t CVal = C->getZExtValue();
5094 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5095 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5096 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5097 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5098 }
5099
5100 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5101 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5102 SDLoc SL(N);
5103 uint64_t CVal = Val.getZExtValue();
5104 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5105 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5106 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5107
5108 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5109 }
5110
5111 break;
5112 }
5113 case ISD::SHL: {
5114 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5115 break;
5116
5117 return performShlCombine(N, DCI);
5118 }
5119 case ISD::SRL: {
5120 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5121 break;
5122
5123 return performSrlCombine(N, DCI);
5124 }
5125 case ISD::SRA: {
5126 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5127 break;
5128
5129 return performSraCombine(N, DCI);
5130 }
5131 case ISD::TRUNCATE:
5132 return performTruncateCombine(N, DCI);
5133 case ISD::MUL:
5134 return performMulCombine(N, DCI);
5135 case AMDGPUISD::MUL_U24:
5136 case AMDGPUISD::MUL_I24: {
5137 if (SDValue Simplified = simplifyMul24(N, DCI))
5138 return Simplified;
5139 break;
5140 }
5141 case AMDGPUISD::MULHI_I24:
5142 case AMDGPUISD::MULHI_U24:
5143 return simplifyMul24(N, DCI);
5144 case ISD::SMUL_LOHI:
5145 case ISD::UMUL_LOHI:
5146 return performMulLoHiCombine(N, DCI);
5147 case ISD::MULHS:
5148 return performMulhsCombine(N, DCI);
5149 case ISD::MULHU:
5150 return performMulhuCombine(N, DCI);
5151 case ISD::SELECT:
5152 return performSelectCombine(N, DCI);
5153 case ISD::FNEG:
5154 return performFNegCombine(N, DCI);
5155 case ISD::FABS:
5156 return performFAbsCombine(N, DCI);
5157 case AMDGPUISD::BFE_I32:
5158 case AMDGPUISD::BFE_U32: {
5159 assert(!N->getValueType(0).isVector() &&
5160 "Vector handling of BFE not implemented");
5161 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5162 if (!Width)
5163 break;
5164
5165 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5166 if (WidthVal == 0)
5167 return DAG.getConstant(0, DL, MVT::i32);
5168
5169 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5170 if (!Offset)
5171 break;
5172
5173 SDValue BitsFrom = N->getOperand(0);
5174 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5175
5176 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5177
5178 if (OffsetVal == 0) {
5179 // This is already sign / zero extended, so try to fold away extra BFEs.
5180 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5181
5182 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5183 if (OpSignBits >= SignBits)
5184 return BitsFrom;
5185
5186 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5187 if (Signed) {
5188 // This is a sign_extend_inreg. Replace it to take advantage of existing
5189 // DAG Combines. If not eliminated, we will match back to BFE during
5190 // selection.
5191
5192         // TODO: The sext_inreg of extended types ends up as multiple operations,
5193         // although we could handle them in a single BFE.
5194 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5195 DAG.getValueType(SmallVT));
5196 }
5197
5198 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5199 }
5200
5201 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5202 if (Signed) {
5203 return constantFoldBFE<int32_t>(DAG,
5204 CVal->getSExtValue(),
5205 OffsetVal,
5206 WidthVal,
5207 DL);
5208 }
5209
5210 return constantFoldBFE<uint32_t>(DAG,
5211 CVal->getZExtValue(),
5212 OffsetVal,
5213 WidthVal,
5214 DL);
5215 }
5216
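    // If the extracted field reaches bit 31, a plain shift by the offset
    // produces it, except for the 16-bit extract at offset 16 when SDWA is
    // available.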
5217 if ((OffsetVal + WidthVal) >= 32 &&
5218 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5219 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5220 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5221 BitsFrom, ShiftVal);
5222 }
5223
5224 if (BitsFrom.hasOneUse()) {
5225 APInt Demanded = APInt::getBitsSet(32,
5226 OffsetVal,
5227 OffsetVal + WidthVal);
5228
5229 KnownBits Known;
5230 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5231 !DCI.isBeforeLegalizeOps());
5232 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5233 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5234 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5235 DCI.CommitTargetLoweringOpt(TLO);
5236 }
5237 }
5238
5239 break;
5240 }
5241 case ISD::LOAD:
5242 return performLoadCombine(N, DCI);
5243 case ISD::STORE:
5244 return performStoreCombine(N, DCI);
5245 case AMDGPUISD::RCP:
5246 case AMDGPUISD::RCP_IFLAG:
5247 return performRcpCombine(N, DCI);
5248 case ISD::AssertZext:
5249 case ISD::AssertSext:
5250 return performAssertSZExtCombine(N, DCI);
5251 case ISD::INTRINSIC_WO_CHAIN:
5252 return performIntrinsicWOChainCombine(N, DCI);
5253 case AMDGPUISD::FMAD_FTZ: {
5254 SDValue N0 = N->getOperand(0);
5255 SDValue N1 = N->getOperand(1);
5256 SDValue N2 = N->getOperand(2);
5257 EVT VT = N->getValueType(0);
5258
5259 // FMAD_FTZ is a FMAD + flush denormals to zero.
5260 // We flush the inputs, the intermediate step, and the output.
5261 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5262 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5263 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5264 if (N0CFP && N1CFP && N2CFP) {
5265 const auto FTZ = [](const APFloat &V) {
5266 if (V.isDenormal()) {
5267 APFloat Zero(V.getSemantics(), 0);
5268 return V.isNegative() ? -Zero : Zero;
5269 }
5270 return V;
5271 };
5272
5273 APFloat V0 = FTZ(N0CFP->getValueAPF());
5274 APFloat V1 = FTZ(N1CFP->getValueAPF());
5275 APFloat V2 = FTZ(N2CFP->getValueAPF());
5276 V0.multiply(V1, APFloat::rmNearestTiesToEven);
5277 V0 = FTZ(V0);
5278 V0.add(V2, APFloat::rmNearestTiesToEven);
5279 return DAG.getConstantFP(FTZ(V0), DL, VT);
5280 }
5281 break;
5282 }
5283 }
5284 return SDValue();
5285 }
5286
5287 //===----------------------------------------------------------------------===//
5288 // Helper functions
5289 //===----------------------------------------------------------------------===//
5290
5291 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5292 const TargetRegisterClass *RC,
5293 Register Reg, EVT VT,
5294 const SDLoc &SL,
5295 bool RawReg) const {
5296 MachineFunction &MF = DAG.getMachineFunction();
5297 MachineRegisterInfo &MRI = MF.getRegInfo();
5298 Register VReg;
5299
5300 if (!MRI.isLiveIn(Reg)) {
5301 VReg = MRI.createVirtualRegister(RC);
5302 MRI.addLiveIn(Reg, VReg);
5303 } else {
5304 VReg = MRI.getLiveInVirtReg(Reg);
5305 }
5306
5307 if (RawReg)
5308 return DAG.getRegister(VReg, VT);
5309
5310 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5311 }
5312
5313 // This may be called multiple times, and nothing prevents creating multiple
5314 // objects at the same offset. See if we already defined this object.
5315 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5316 int64_t Offset) {
5317 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5318 if (MFI.getObjectOffset(I) == Offset) {
5319 assert(MFI.getObjectSize(I) == Size);
5320 return I;
5321 }
5322 }
5323
5324 return MFI.CreateFixedObject(Size, Offset, true);
5325 }
5326
5327 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5328 EVT VT,
5329 const SDLoc &SL,
5330 int64_t Offset) const {
5331 MachineFunction &MF = DAG.getMachineFunction();
5332 MachineFrameInfo &MFI = MF.getFrameInfo();
5333 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5334
5335 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5336 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5337
5338 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5339 MachineMemOperand::MODereferenceable |
5340 MachineMemOperand::MOInvariant);
5341 }
5342
5343 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5344 const SDLoc &SL,
5345 SDValue Chain,
5346 SDValue ArgVal,
5347 int64_t Offset) const {
5348 MachineFunction &MF = DAG.getMachineFunction();
5349 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5350 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5351
5352 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5353 // Stores to the argument stack area are relative to the stack pointer.
5354 SDValue SP =
5355 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5356 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5357 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5358 MachineMemOperand::MODereferenceable);
5359 return Store;
5360 }
5361
5362 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5363 const TargetRegisterClass *RC,
5364 EVT VT, const SDLoc &SL,
5365 const ArgDescriptor &Arg) const {
5366 assert(Arg && "Attempting to load missing argument");
5367
5368 SDValue V = Arg.isRegister() ?
5369 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5370 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5371
5372 if (!Arg.isMasked())
5373 return V;
5374
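  // A masked argument occupies a bit-field within the input; extract it by
  // shifting the field down and masking off the surrounding bits.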
5375 unsigned Mask = Arg.getMask();
5376 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5377 V = DAG.getNode(ISD::SRL, SL, VT, V,
5378 DAG.getShiftAmountConstant(Shift, VT, SL));
5379 return DAG.getNode(ISD::AND, SL, VT, V,
5380 DAG.getConstant(Mask >> Shift, SL, VT));
5381 }
5382
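// Implicit kernel arguments are laid out directly after the explicit kernel
// arguments, with the explicit block rounded up to the implicit argument
// pointer alignment; Param selects a fixed offset within that implicit block.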
5383 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5384 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5385 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5386 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5387 uint64_t ArgOffset =
5388 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5389 switch (Param) {
5390 case FIRST_IMPLICIT:
5391 return ArgOffset;
5392 case PRIVATE_BASE:
5393 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5394 case SHARED_BASE:
5395 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5396 case QUEUE_PTR:
5397 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5398 }
5399 llvm_unreachable("unexpected implicit parameter type");
5400 }
5401
5402 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5403 const MachineFunction &MF, const ImplicitParameter Param) const {
5404 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5405 return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
5406 }
5407
5408 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5409
5410 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5411 switch ((AMDGPUISD::NodeType)Opcode) {
5412 case AMDGPUISD::FIRST_NUMBER: break;
5413 // AMDIL DAG nodes
5414 NODE_NAME_CASE(UMUL);
5415 NODE_NAME_CASE(BRANCH_COND);
5416
5417 // AMDGPU DAG nodes
5418 NODE_NAME_CASE(IF)
5419 NODE_NAME_CASE(ELSE)
5420 NODE_NAME_CASE(LOOP)
5421 NODE_NAME_CASE(CALL)
5422 NODE_NAME_CASE(TC_RETURN)
5423 NODE_NAME_CASE(TC_RETURN_GFX)
5424 NODE_NAME_CASE(TC_RETURN_CHAIN)
5425 NODE_NAME_CASE(TRAP)
5426 NODE_NAME_CASE(RET_GLUE)
5427 NODE_NAME_CASE(WAVE_ADDRESS)
5428 NODE_NAME_CASE(RETURN_TO_EPILOG)
5429 NODE_NAME_CASE(ENDPGM)
5430 NODE_NAME_CASE(ENDPGM_TRAP)
5431 NODE_NAME_CASE(SIMULATED_TRAP)
5432 NODE_NAME_CASE(DWORDADDR)
5433 NODE_NAME_CASE(FRACT)
5434 NODE_NAME_CASE(SETCC)
5435 NODE_NAME_CASE(SETREG)
5436 NODE_NAME_CASE(DENORM_MODE)
5437 NODE_NAME_CASE(FMA_W_CHAIN)
5438 NODE_NAME_CASE(FMUL_W_CHAIN)
5439 NODE_NAME_CASE(CLAMP)
5440 NODE_NAME_CASE(COS_HW)
5441 NODE_NAME_CASE(SIN_HW)
5442 NODE_NAME_CASE(FMAX_LEGACY)
5443 NODE_NAME_CASE(FMIN_LEGACY)
5444 NODE_NAME_CASE(FMAX3)
5445 NODE_NAME_CASE(SMAX3)
5446 NODE_NAME_CASE(UMAX3)
5447 NODE_NAME_CASE(FMIN3)
5448 NODE_NAME_CASE(SMIN3)
5449 NODE_NAME_CASE(UMIN3)
5450 NODE_NAME_CASE(FMED3)
5451 NODE_NAME_CASE(SMED3)
5452 NODE_NAME_CASE(UMED3)
5453 NODE_NAME_CASE(FMAXIMUM3)
5454 NODE_NAME_CASE(FMINIMUM3)
5455 NODE_NAME_CASE(FDOT2)
5456 NODE_NAME_CASE(URECIP)
5457 NODE_NAME_CASE(DIV_SCALE)
5458 NODE_NAME_CASE(DIV_FMAS)
5459 NODE_NAME_CASE(DIV_FIXUP)
5460 NODE_NAME_CASE(FMAD_FTZ)
5461 NODE_NAME_CASE(RCP)
5462 NODE_NAME_CASE(RSQ)
5463 NODE_NAME_CASE(RCP_LEGACY)
5464 NODE_NAME_CASE(RCP_IFLAG)
5465 NODE_NAME_CASE(LOG)
5466 NODE_NAME_CASE(EXP)
5467 NODE_NAME_CASE(FMUL_LEGACY)
5468 NODE_NAME_CASE(RSQ_CLAMP)
5469 NODE_NAME_CASE(FP_CLASS)
5470 NODE_NAME_CASE(DOT4)
5471 NODE_NAME_CASE(CARRY)
5472 NODE_NAME_CASE(BORROW)
5473 NODE_NAME_CASE(BFE_U32)
5474 NODE_NAME_CASE(BFE_I32)
5475 NODE_NAME_CASE(BFI)
5476 NODE_NAME_CASE(BFM)
5477 NODE_NAME_CASE(FFBH_U32)
5478 NODE_NAME_CASE(FFBH_I32)
5479 NODE_NAME_CASE(FFBL_B32)
5480 NODE_NAME_CASE(MUL_U24)
5481 NODE_NAME_CASE(MUL_I24)
5482 NODE_NAME_CASE(MULHI_U24)
5483 NODE_NAME_CASE(MULHI_I24)
5484 NODE_NAME_CASE(MAD_U24)
5485 NODE_NAME_CASE(MAD_I24)
5486 NODE_NAME_CASE(MAD_I64_I32)
5487 NODE_NAME_CASE(MAD_U64_U32)
5488 NODE_NAME_CASE(PERM)
5489 NODE_NAME_CASE(TEXTURE_FETCH)
5490 NODE_NAME_CASE(R600_EXPORT)
5491 NODE_NAME_CASE(CONST_ADDRESS)
5492 NODE_NAME_CASE(REGISTER_LOAD)
5493 NODE_NAME_CASE(REGISTER_STORE)
5494 NODE_NAME_CASE(SAMPLE)
5495 NODE_NAME_CASE(SAMPLEB)
5496 NODE_NAME_CASE(SAMPLED)
5497 NODE_NAME_CASE(SAMPLEL)
5498 NODE_NAME_CASE(CVT_F32_UBYTE0)
5499 NODE_NAME_CASE(CVT_F32_UBYTE1)
5500 NODE_NAME_CASE(CVT_F32_UBYTE2)
5501 NODE_NAME_CASE(CVT_F32_UBYTE3)
5502 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5503 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5504 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5505 NODE_NAME_CASE(CVT_PK_I16_I32)
5506 NODE_NAME_CASE(CVT_PK_U16_U32)
5507 NODE_NAME_CASE(FP_TO_FP16)
5508 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5509 NODE_NAME_CASE(CONST_DATA_PTR)
5510 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5511 NODE_NAME_CASE(LDS)
5512 NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
5513 NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
5514 NODE_NAME_CASE(DUMMY_CHAIN)
5515 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
5516 NODE_NAME_CASE(LOAD_D16_HI)
5517 NODE_NAME_CASE(LOAD_D16_LO)
5518 NODE_NAME_CASE(LOAD_D16_HI_I8)
5519 NODE_NAME_CASE(LOAD_D16_HI_U8)
5520 NODE_NAME_CASE(LOAD_D16_LO_I8)
5521 NODE_NAME_CASE(LOAD_D16_LO_U8)
5522 NODE_NAME_CASE(STORE_MSKOR)
5523 NODE_NAME_CASE(LOAD_CONSTANT)
5524 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5525 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5526 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5527 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5528 NODE_NAME_CASE(DS_ORDERED_COUNT)
5529 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5530 NODE_NAME_CASE(BUFFER_LOAD)
5531 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5532 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5533 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5534 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5535 NODE_NAME_CASE(BUFFER_LOAD_TFE)
5536 NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE)
5537 NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE)
5538 NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE)
5539 NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE)
5540 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5541 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5542 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5543 NODE_NAME_CASE(SBUFFER_LOAD)
5544 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5545 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5546 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5547 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5548 NODE_NAME_CASE(BUFFER_STORE)
5549 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5550 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5551 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5552 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5553 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5554 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5555 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5556 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5557 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5558 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5559 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5560 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5561 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5562 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5563 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5564 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5565 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5566 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5567 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5568 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5569 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5570 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5571
5572 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
5573 }
5574 return nullptr;
5575 }
5576
5577 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5578 SelectionDAG &DAG, int Enabled,
5579 int &RefinementSteps,
5580 bool &UseOneConstNR,
5581 bool Reciprocal) const {
5582 EVT VT = Operand.getValueType();
5583
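  // For f32, use the hardware reciprocal square root estimate directly and
  // request no Newton-Raphson refinement steps.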
5584 if (VT == MVT::f32) {
5585 RefinementSteps = 0;
5586 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5587 }
5588
5589   // TODO: There is also an f64 rsq instruction, but the documentation is less
5590 // clear on its precision.
5591
5592 return SDValue();
5593 }
5594
5595 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5596 SelectionDAG &DAG, int Enabled,
5597 int &RefinementSteps) const {
5598 EVT VT = Operand.getValueType();
5599
5600 if (VT == MVT::f32) {
5601 // Reciprocal, < 1 ulp error.
5602 //
5603 // This reciprocal approximation converges to < 0.5 ulp error with one
5604     // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5605
5606 RefinementSteps = 0;
5607 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5608 }
5609
5610   // TODO: There is also an f64 rcp instruction, but the documentation is less
5611 // clear on its precision.
5612
5613 return SDValue();
5614 }
5615
5616 static unsigned workitemIntrinsicDim(unsigned ID) {
5617 switch (ID) {
5618 case Intrinsic::amdgcn_workitem_id_x:
5619 return 0;
5620 case Intrinsic::amdgcn_workitem_id_y:
5621 return 1;
5622 case Intrinsic::amdgcn_workitem_id_z:
5623 return 2;
5624 default:
5625 llvm_unreachable("not a workitem intrinsic");
5626 }
5627 }
5628
5629 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5630 const SDValue Op, KnownBits &Known,
5631 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5632
5633 Known.resetAll(); // Don't know anything.
5634
5635 unsigned Opc = Op.getOpcode();
5636
5637 switch (Opc) {
5638 default:
5639 break;
5640 case AMDGPUISD::CARRY:
5641 case AMDGPUISD::BORROW: {
5642 Known.Zero = APInt::getHighBitsSet(32, 31);
5643 break;
5644 }
5645
5646 case AMDGPUISD::BFE_I32:
5647 case AMDGPUISD::BFE_U32: {
5648 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5649 if (!CWidth)
5650 return;
5651
5652 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5653
5654 if (Opc == AMDGPUISD::BFE_U32)
5655 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5656
5657 break;
5658 }
5659 case AMDGPUISD::FP_TO_FP16: {
5660 unsigned BitWidth = Known.getBitWidth();
5661
5662 // High bits are zero.
5663 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5664 break;
5665 }
5666 case AMDGPUISD::MUL_U24:
5667 case AMDGPUISD::MUL_I24: {
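    // These nodes only read the low 24 bits of their operands, so the known
    // bits of the product are derived from those truncated values.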
5668 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5669 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5670 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5671 RHSKnown.countMinTrailingZeros();
5672 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5673 // Skip extra check if all bits are known zeros.
5674 if (TrailZ >= 32)
5675 break;
5676
5677 // Truncate to 24 bits.
5678 LHSKnown = LHSKnown.trunc(24);
5679 RHSKnown = RHSKnown.trunc(24);
5680
5681 if (Opc == AMDGPUISD::MUL_I24) {
5682 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5683 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5684 unsigned MaxValBits = LHSValBits + RHSValBits;
5685 if (MaxValBits > 32)
5686 break;
5687 unsigned SignBits = 32 - MaxValBits + 1;
5688 bool LHSNegative = LHSKnown.isNegative();
5689 bool LHSNonNegative = LHSKnown.isNonNegative();
5690 bool LHSPositive = LHSKnown.isStrictlyPositive();
5691 bool RHSNegative = RHSKnown.isNegative();
5692 bool RHSNonNegative = RHSKnown.isNonNegative();
5693 bool RHSPositive = RHSKnown.isStrictlyPositive();
5694
5695 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5696 Known.Zero.setHighBits(SignBits);
5697 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5698 Known.One.setHighBits(SignBits);
5699 } else {
5700 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5701 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5702 unsigned MaxValBits = LHSValBits + RHSValBits;
5703 if (MaxValBits >= 32)
5704 break;
5705 Known.Zero.setBitsFrom(MaxValBits);
5706 }
5707 break;
5708 }
5709 case AMDGPUISD::PERM: {
5710 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5711 if (!CMask)
5712 return;
5713
5714 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5715 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5716 unsigned Sel = CMask->getZExtValue();
5717
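    // Each selector byte picks one result byte: values 0-3 take a byte from
    // the second operand, 4-6 take a byte from the first operand, 0x0c gives
    // a known-zero byte, values above 0x0c give a known-ones byte, and
    // anything else is left unknown.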
5718 for (unsigned I = 0; I < 32; I += 8) {
5719 unsigned SelBits = Sel & 0xff;
5720 if (SelBits < 4) {
5721 SelBits *= 8;
5722 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5723 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5724 } else if (SelBits < 7) {
5725 SelBits = (SelBits & 3) * 8;
5726 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5727 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5728 } else if (SelBits == 0x0c) {
5729 Known.Zero |= 0xFFull << I;
5730 } else if (SelBits > 0x0c) {
5731 Known.One |= 0xFFull << I;
5732 }
5733 Sel >>= 8;
5734 }
5735 break;
5736 }
5737 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5738 Known.Zero.setHighBits(24);
5739 break;
5740 }
5741 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5742 Known.Zero.setHighBits(16);
5743 break;
5744 }
5745 case AMDGPUISD::LDS: {
5746 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5747 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5748
5749 Known.Zero.setHighBits(16);
5750 Known.Zero.setLowBits(Log2(Alignment));
5751 break;
5752 }
5753 case AMDGPUISD::SMIN3:
5754 case AMDGPUISD::SMAX3:
5755 case AMDGPUISD::SMED3:
5756 case AMDGPUISD::UMIN3:
5757 case AMDGPUISD::UMAX3:
5758 case AMDGPUISD::UMED3: {
5759 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5760 if (Known2.isUnknown())
5761 break;
5762
5763 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5764 if (Known1.isUnknown())
5765 break;
5766
5767 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5768 if (Known0.isUnknown())
5769 break;
5770
5771 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5772 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5773 Known.One = Known0.One & Known1.One & Known2.One;
5774 break;
5775 }
5776 case ISD::INTRINSIC_WO_CHAIN: {
5777 unsigned IID = Op.getConstantOperandVal(0);
5778 switch (IID) {
5779 case Intrinsic::amdgcn_workitem_id_x:
5780 case Intrinsic::amdgcn_workitem_id_y:
5781 case Intrinsic::amdgcn_workitem_id_z: {
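      // The workitem ID cannot exceed the maximum workitem ID for this
      // dimension, so all bits above that bound are known zero.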
5782 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5783 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
5784 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
5785 break;
5786 }
5787 default:
5788 break;
5789 }
5790 }
5791 }
5792 }
5793
5794 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5795 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5796 unsigned Depth) const {
5797 switch (Op.getOpcode()) {
5798 case AMDGPUISD::BFE_I32: {
5799 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5800 if (!Width)
5801 return 1;
5802
5803 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5804 if (!isNullConstant(Op.getOperand(1)))
5805 return SignBits;
5806
5807 // TODO: Could probably figure something out with non-0 offsets.
5808 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5809 return std::max(SignBits, Op0SignBits);
5810 }
5811
5812 case AMDGPUISD::BFE_U32: {
5813 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5814 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5815 }
5816
5817 case AMDGPUISD::CARRY:
5818 case AMDGPUISD::BORROW:
5819 return 31;
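  // Sign-extending byte/short buffer loads give 25/17 sign bits; the
  // zero-extending forms give 24/16 because their high bits are known zero.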
5820 case AMDGPUISD::BUFFER_LOAD_BYTE:
5821 return 25;
5822 case AMDGPUISD::BUFFER_LOAD_SHORT:
5823 return 17;
5824 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5825 return 24;
5826 case AMDGPUISD::BUFFER_LOAD_USHORT:
5827 return 16;
5828 case AMDGPUISD::FP_TO_FP16:
5829 return 16;
5830 case AMDGPUISD::SMIN3:
5831 case AMDGPUISD::SMAX3:
5832 case AMDGPUISD::SMED3:
5833 case AMDGPUISD::UMIN3:
5834 case AMDGPUISD::UMAX3:
5835 case AMDGPUISD::UMED3: {
5836 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
5837 if (Tmp2 == 1)
5838 return 1; // Early out.
5839
5840 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
5841 if (Tmp1 == 1)
5842 return 1; // Early out.
5843
5844 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5845 if (Tmp0 == 1)
5846 return 1; // Early out.
5847
5848 return std::min({Tmp0, Tmp1, Tmp2});
5849 }
5850 default:
5851 return 1;
5852 }
5853 }
5854
5855 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
5856 GISelKnownBits &Analysis, Register R,
5857 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
5858 unsigned Depth) const {
5859 const MachineInstr *MI = MRI.getVRegDef(R);
5860 if (!MI)
5861 return 1;
5862
5863 // TODO: Check range metadata on MMO.
5864 switch (MI->getOpcode()) {
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
    return 25;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
    return 17;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    return 24;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    return 16;
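  // med3 returns one of its source operands, so take the minimum sign-bit
  // count over the three sources.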
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
    unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
    if (Tmp2 == 1)
      return 1;
    unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
    if (Tmp1 == 1)
      return 1;
    unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
    if (Tmp0 == 1)
      return 1;
    return std::min({Tmp0, Tmp1, Tmp2});
  }
  default:
    return 1;
  }
}

bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                        const SelectionDAG &DAG,
                                                        bool SNaN,
                                                        unsigned Depth) const {
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY: {
    if (SNaN)
      return true;

    // TODO: For each of these, we could check that one of the operands is
    // known non-NaN, but which one?
    return false;
  }
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::CVT_PKRTZ_F16_F32: {
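    // These are arithmetic results, so they never produce a signaling NaN
    // themselves; a NaN in the result can only come from a NaN operand.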
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
  }
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMINIMUM3:
  case AMDGPUISD::FMAXIMUM3:
  case AMDGPUISD::FMAD_FTZ: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
  }
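  // An unsigned byte converted to f32 is always a small finite value, never a
  // NaN.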
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return true;

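  // These propagate NaN operands, and rsq of a negative input is NaN, so only
  // the signaling-NaN query can be answered without more information.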
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_CLAMP: {
    if (SNaN)
      return true;

    // TODO: Need an is-known-positive check.
    return false;
  }
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
  }
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
    // TODO: Refine on operands.
    return SNaN;
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::COS_HW: {
    // TODO: Need a check for infinity.
    return SNaN;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID = Op.getConstantOperandVal(0);
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cubeid:
      return true;

    case Intrinsic::amdgcn_frexp_mant: {
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
    }
    case Intrinsic::amdgcn_cvt_pkrtz: {
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
    }
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp: {
      if (SNaN)
        return true;

      // TODO: Need an is-known-positive check.
      return false;
    }
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_fdot2:
      // TODO: Refine on operand
      return SNaN;
    case Intrinsic::amdgcn_fma_legacy:
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
                                               Register N0, Register N1) const {
  return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
}

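// In this base implementation, nand and the floating-point RMW operations are
// always expanded to a cmpxchg loop; exchanges and integer operations are kept
// only when they act on 32- or 64-bit values.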
TargetLowering::AtomicExpansionKind
AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  switch (RMW->getOperation()) {
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
    return AtomicExpansionKind::CmpXChg;
  case AtomicRMWInst::Xchg: {
    const DataLayout &DL = RMW->getFunction()->getDataLayout();
    unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
    if (ValSize == 32 || ValSize == 64)
      return AtomicExpansionKind::None;
    return AtomicExpansionKind::CmpXChg;
  }
  default: {
    if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
      unsigned Size = IntTy->getBitWidth();
      if (Size == 32 || Size == 64)
        return AtomicExpansionKind::None;
    }

    return AtomicExpansionKind::CmpXChg;
  }
  }
}

/// Whether it is profitable to sink the operands of an instruction I into the
/// basic block of I. This helps instruction selection use modifiers (like abs
/// and neg) more often.
bool AMDGPUTargetLowering::shouldSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  using namespace PatternMatch;

  for (auto &Op : I->operands()) {
    // Ensure we are not already sinking this operand.
    if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
      continue;

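    // fneg and fabs typically fold into their user as source modifiers, so
    // sinking them next to the user lets instruction selection do that fold.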
    if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
      Ops.push_back(&Op);
  }

  return !Ops.empty();
}
