xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (revision 9f23cbd6cae82fd77edfad7173432fa8dccd0a95)
1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This is the parent TargetLowering class for hardware code gen
11 /// targets.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUISelLowering.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUMachineFunction.h"
19 #include "GCNSubtarget.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/CodeGen/Analysis.h"
22 #include "llvm/CodeGen/MachineFrameInfo.h"
23 #include "llvm/IR/DiagnosticInfo.h"
24 #include "llvm/IR/IntrinsicsAMDGPU.h"
25 #include "llvm/Support/CommandLine.h"
26 #include "llvm/Support/KnownBits.h"
27 #include "llvm/Target/TargetMachine.h"
28 
29 using namespace llvm;
30 
31 #include "AMDGPUGenCallingConv.inc"
32 
33 static cl::opt<bool> AMDGPUBypassSlowDiv(
34   "amdgpu-bypass-slow-div",
35   cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
36   cl::init(true));
37 
38 // Find a larger type to do a load / store of a vector with.
39 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
40   unsigned StoreSize = VT.getStoreSizeInBits();
41   if (StoreSize <= 32)
42     return EVT::getIntegerVT(Ctx, StoreSize);
43 
44   assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
45   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
46 }
47 
48 unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
49   return DAG.computeKnownBits(Op).countMaxActiveBits();
50 }
51 
52 unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
53   // In order for this to be a signed 24-bit value, bit 23, must
54   // be a sign bit.
55   return DAG.ComputeMaxSignificantBits(Op);
56 }
57 
58 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
59                                            const AMDGPUSubtarget &STI)
60     : TargetLowering(TM), Subtarget(&STI) {
61   // Lower floating point store/load to integer store/load to reduce the number
62   // of patterns in tablegen.
63   setOperationAction(ISD::LOAD, MVT::f32, Promote);
64   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
65 
66   setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
67   AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
68 
69   setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
70   AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
71 
72   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
73   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
74 
75   setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
76   AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
77 
78   setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
79   AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
80 
81   setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
82   AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
83 
84   setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
85   AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
86 
87   setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
88   AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
89 
90   setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
91   AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
92 
93   setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
94   AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
95 
96   setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
97   AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
98 
99   setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
100   AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
101 
102   setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
103   AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
104 
105   setOperationAction(ISD::LOAD, MVT::i64, Promote);
106   AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
107 
108   setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
109   AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
110 
111   setOperationAction(ISD::LOAD, MVT::f64, Promote);
112   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
113 
114   setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
115   AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
116 
117   setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
118   AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
119 
120   setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
121   AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
122 
123   setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
124   AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
125 
126   setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
127   AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
128 
129   setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
130   AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
131 
132   setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
133   AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
134 
135   setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
136   AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
137 
138   setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
139   AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
140 
141   // There are no 64-bit extloads. These should be done as a 32-bit extload and
142   // an extension to 64-bit.
143   for (MVT VT : MVT::integer_valuetypes())
144     setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
145                      Expand);
146 
147   for (MVT VT : MVT::integer_valuetypes()) {
148     if (VT == MVT::i64)
149       continue;
150 
151     for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
152       setLoadExtAction(Op, VT, MVT::i1, Promote);
153       setLoadExtAction(Op, VT, MVT::i8, Legal);
154       setLoadExtAction(Op, VT, MVT::i16, Legal);
155       setLoadExtAction(Op, VT, MVT::i32, Expand);
156     }
157   }
158 
159   for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
160     for (auto MemVT :
161          {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
162       setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
163                        Expand);
164 
165   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
166   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
167   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
168   setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
169   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
170   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
171   setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
172   setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
173 
174   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
175   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
176   setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
177   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
178   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
179   setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
180 
181   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
182   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
183   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
184   setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
185   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
186   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
187   setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
188 
189   setOperationAction(ISD::STORE, MVT::f32, Promote);
190   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
191 
192   setOperationAction(ISD::STORE, MVT::v2f32, Promote);
193   AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
194 
195   setOperationAction(ISD::STORE, MVT::v3f32, Promote);
196   AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
197 
198   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
199   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
200 
201   setOperationAction(ISD::STORE, MVT::v5f32, Promote);
202   AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
203 
204   setOperationAction(ISD::STORE, MVT::v6f32, Promote);
205   AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
206 
207   setOperationAction(ISD::STORE, MVT::v7f32, Promote);
208   AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
209 
210   setOperationAction(ISD::STORE, MVT::v8f32, Promote);
211   AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
212 
213   setOperationAction(ISD::STORE, MVT::v9f32, Promote);
214   AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
215 
216   setOperationAction(ISD::STORE, MVT::v10f32, Promote);
217   AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
218 
219   setOperationAction(ISD::STORE, MVT::v11f32, Promote);
220   AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
221 
222   setOperationAction(ISD::STORE, MVT::v12f32, Promote);
223   AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
224 
225   setOperationAction(ISD::STORE, MVT::v16f32, Promote);
226   AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
227 
228   setOperationAction(ISD::STORE, MVT::v32f32, Promote);
229   AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
230 
231   setOperationAction(ISD::STORE, MVT::i64, Promote);
232   AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
233 
234   setOperationAction(ISD::STORE, MVT::v2i64, Promote);
235   AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
236 
237   setOperationAction(ISD::STORE, MVT::f64, Promote);
238   AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
239 
240   setOperationAction(ISD::STORE, MVT::v2f64, Promote);
241   AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
242 
243   setOperationAction(ISD::STORE, MVT::v3i64, Promote);
244   AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
245 
246   setOperationAction(ISD::STORE, MVT::v3f64, Promote);
247   AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
248 
249   setOperationAction(ISD::STORE, MVT::v4i64, Promote);
250   AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
251 
252   setOperationAction(ISD::STORE, MVT::v4f64, Promote);
253   AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
254 
255   setOperationAction(ISD::STORE, MVT::v8i64, Promote);
256   AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
257 
258   setOperationAction(ISD::STORE, MVT::v8f64, Promote);
259   AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
260 
261   setOperationAction(ISD::STORE, MVT::v16i64, Promote);
262   AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
263 
264   setOperationAction(ISD::STORE, MVT::v16f64, Promote);
265   AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
266 
267   setTruncStoreAction(MVT::i64, MVT::i1, Expand);
268   setTruncStoreAction(MVT::i64, MVT::i8, Expand);
269   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
270   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
271 
272   setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
273   setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
274   setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
275   setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
276 
277   setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
278   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
279   setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
280   setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
281   setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
282   setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
283   setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
284   setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
285 
286   setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
287   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
288   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
289 
290   setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
291   setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
292 
293   setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
294   setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
295   setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
296   setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
297 
298   setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
299   setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
300   setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
301   setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
302 
303   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
304   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
305 
306   setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
307   setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
308   setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
309   setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
310   setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
311   setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
312   setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
313 
314   setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
315   setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
316 
317   setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);
318 
319   // This is totally unsupported, just custom lower to produce an error.
320   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
321 
322   // Library functions.  These default to Expand, but we have instructions
323   // for them.
324   setOperationAction({ISD::FCEIL, ISD::FEXP2, ISD::FPOW, ISD::FLOG2, ISD::FABS,
325                       ISD::FFLOOR, ISD::FRINT, ISD::FTRUNC, ISD::FMINNUM,
326                       ISD::FMAXNUM},
327                      MVT::f32, Legal);
328 
329   setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
330 
331   setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP}, MVT::f32, Custom);
332 
333   setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
334 
335   setOperationAction(ISD::FROUNDEVEN, {MVT::f16, MVT::f32, MVT::f64}, Custom);
336 
337   setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
338 
339   if (Subtarget->has16BitInsts())
340     setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
341   else
342     setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
343 
344   // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
345   // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
346   // default unless marked custom/legal.
347   setOperationAction(
348       ISD::IS_FPCLASS,
349       {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
350        MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
351        MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
352       Custom);
353 
354   // Expand to fneg + fadd.
355   setOperationAction(ISD::FSUB, MVT::f64, Expand);
356 
357   setOperationAction(ISD::CONCAT_VECTORS,
358                      {MVT::v3i32,  MVT::v3f32,  MVT::v4i32,  MVT::v4f32,
359                       MVT::v5i32,  MVT::v5f32,  MVT::v6i32,  MVT::v6f32,
360                       MVT::v7i32,  MVT::v7f32,  MVT::v8i32,  MVT::v8f32,
361                       MVT::v9i32,  MVT::v9f32,  MVT::v10i32, MVT::v10f32,
362                       MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
363                      Custom);
364   setOperationAction(
365       ISD::EXTRACT_SUBVECTOR,
366       {MVT::v2f16,  MVT::v2i16,  MVT::v4f16,  MVT::v4i16,  MVT::v2f32,
367        MVT::v2i32,  MVT::v3f32,  MVT::v3i32,  MVT::v4f32,  MVT::v4i32,
368        MVT::v5f32,  MVT::v5i32,  MVT::v6f32,  MVT::v6i32,  MVT::v7f32,
369        MVT::v7i32,  MVT::v8f32,  MVT::v8i32,  MVT::v9f32,  MVT::v9i32,
370        MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32,
371        MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32,
372        MVT::v32f32, MVT::v32i32, MVT::v2f64,  MVT::v2i64,  MVT::v3f64,
373        MVT::v3i64,  MVT::v4f64,  MVT::v4i64,  MVT::v8f64,  MVT::v8i64,
374        MVT::v16f64, MVT::v16i64},
375       Custom);
376 
377   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
378   setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
379 
380   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
381   for (MVT VT : ScalarIntVTs) {
382     // These should use [SU]DIVREM, so set them to expand
383     setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
384                        Expand);
385 
386     // GPU does not have divrem function for signed or unsigned.
387     setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);
388 
389     // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
390     setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);
391 
392     setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);
393 
394     // AMDGPU uses ADDC/SUBC/ADDE/SUBE
395     setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
396   }
397 
398   // The hardware supports 32-bit FSHR, but not FSHL.
399   setOperationAction(ISD::FSHR, MVT::i32, Legal);
400 
401   // The hardware supports 32-bit ROTR, but not ROTL.
402   setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
403   setOperationAction(ISD::ROTR, MVT::i64, Expand);
404 
405   setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);
406 
407   setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
408   setOperationAction(
409       {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
410       MVT::i64, Custom);
411   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
412 
413   setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
414                      Legal);
415 
416   setOperationAction(
417       {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
418       MVT::i64, Custom);
419 
420   static const MVT::SimpleValueType VectorIntTypes[] = {
421       MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
422       MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
423 
424   for (MVT VT : VectorIntTypes) {
425     // Expand the following operations for the current type by default.
426     setOperationAction({ISD::ADD,        ISD::AND,     ISD::FP_TO_SINT,
427                         ISD::FP_TO_UINT, ISD::MUL,     ISD::MULHU,
428                         ISD::MULHS,      ISD::OR,      ISD::SHL,
429                         ISD::SRA,        ISD::SRL,     ISD::ROTL,
430                         ISD::ROTR,       ISD::SUB,     ISD::SINT_TO_FP,
431                         ISD::UINT_TO_FP, ISD::SDIV,    ISD::UDIV,
432                         ISD::SREM,       ISD::UREM,    ISD::SMUL_LOHI,
433                         ISD::UMUL_LOHI,  ISD::SDIVREM, ISD::UDIVREM,
434                         ISD::SELECT,     ISD::VSELECT, ISD::SELECT_CC,
435                         ISD::XOR,        ISD::BSWAP,   ISD::CTPOP,
436                         ISD::CTTZ,       ISD::CTLZ,    ISD::VECTOR_SHUFFLE,
437                         ISD::SETCC},
438                        VT, Expand);
439   }
440 
441   static const MVT::SimpleValueType FloatVectorTypes[] = {
442       MVT::v2f32, MVT::v3f32,  MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
443       MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
444 
445   for (MVT VT : FloatVectorTypes) {
446     setOperationAction(
447         {ISD::FABS,    ISD::FMINNUM,      ISD::FMAXNUM,   ISD::FADD,
448          ISD::FCEIL,   ISD::FCOS,         ISD::FDIV,      ISD::FEXP2,
449          ISD::FEXP,    ISD::FLOG2,        ISD::FREM,      ISD::FLOG,
450          ISD::FLOG10,  ISD::FPOW,         ISD::FFLOOR,    ISD::FTRUNC,
451          ISD::FMUL,    ISD::FMA,          ISD::FRINT,     ISD::FNEARBYINT,
452          ISD::FSQRT,   ISD::FSIN,         ISD::FSUB,      ISD::FNEG,
453          ISD::VSELECT, ISD::SELECT_CC,    ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE,
454          ISD::SETCC,   ISD::FCANONICALIZE},
455         VT, Expand);
456   }
457 
458   // This causes using an unrolled select operation rather than expansion with
459   // bit operations. This is in general better, but the alternative using BFI
460   // instructions may be better if the select sources are SGPRs.
461   setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
462   AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
463 
464   setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
465   AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
466 
467   setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
468   AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
469 
470   setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
471   AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
472 
473   setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
474   AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
475 
476   setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
477   AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
478 
479   setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
480   AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
481 
482   setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
483   AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
484 
485   setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
486   AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
487 
488   setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
489   AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
490 
491   // There are no libcalls of any kind.
492   for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
493     setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
494 
495   setSchedulingPreference(Sched::RegPressure);
496   setJumpIsExpensive(true);
497 
498   // FIXME: This is only partially true. If we have to do vector compares, any
499   // SGPR pair can be a condition register. If we have a uniform condition, we
500   // are better off doing SALU operations, where there is only one SCC. For now,
501   // we don't have a way of knowing during instruction selection if a condition
502   // will be uniform and we always use vector compares. Assume we are using
503   // vector compares until that is fixed.
504   setHasMultipleConditionRegisters(true);
505 
506   setMinCmpXchgSizeInBits(32);
507   setSupportsUnalignedAtomics(false);
508 
509   PredictableSelectIsExpensive = false;
510 
511   // We want to find all load dependencies for long chains of stores to enable
512   // merging into very wide vectors. The problem is with vectors with > 4
513   // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
514   // vectors are a legal type, even though we have to split the loads
515   // usually. When we can more precisely specify load legality per address
516   // space, we should be able to make FindBetterChain/MergeConsecutiveStores
517   // smarter so that they can figure out what to do in 2 iterations without all
518   // N > 4 stores on the same chain.
519   GatherAllAliasesMaxDepth = 16;
520 
521   // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
522   // about these during lowering.
523   MaxStoresPerMemcpy  = 0xffffffff;
524   MaxStoresPerMemmove = 0xffffffff;
525   MaxStoresPerMemset  = 0xffffffff;
526 
527   // The expansion for 64-bit division is enormous.
528   if (AMDGPUBypassSlowDiv)
529     addBypassSlowDiv(64, 32);
530 
531   setTargetDAGCombine({ISD::BITCAST,    ISD::SHL,
532                        ISD::SRA,        ISD::SRL,
533                        ISD::TRUNCATE,   ISD::MUL,
534                        ISD::SMUL_LOHI,  ISD::UMUL_LOHI,
535                        ISD::MULHU,      ISD::MULHS,
536                        ISD::SELECT,     ISD::SELECT_CC,
537                        ISD::STORE,      ISD::FADD,
538                        ISD::FSUB,       ISD::FNEG,
539                        ISD::FABS,       ISD::AssertZext,
540                        ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
541 }
542 
543 bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
544   if (getTargetMachine().Options.NoSignedZerosFPMath)
545     return true;
546 
547   const auto Flags = Op.getNode()->getFlags();
548   if (Flags.hasNoSignedZeros())
549     return true;
550 
551   return false;
552 }
553 
554 //===----------------------------------------------------------------------===//
555 // Target Information
556 //===----------------------------------------------------------------------===//
557 
558 LLVM_READNONE
559 static bool fnegFoldsIntoOp(unsigned Opc) {
560   switch (Opc) {
561   case ISD::FADD:
562   case ISD::FSUB:
563   case ISD::FMUL:
564   case ISD::FMA:
565   case ISD::FMAD:
566   case ISD::FMINNUM:
567   case ISD::FMAXNUM:
568   case ISD::FMINNUM_IEEE:
569   case ISD::FMAXNUM_IEEE:
570   case ISD::FSIN:
571   case ISD::FTRUNC:
572   case ISD::FRINT:
573   case ISD::FNEARBYINT:
574   case ISD::FCANONICALIZE:
575   case AMDGPUISD::RCP:
576   case AMDGPUISD::RCP_LEGACY:
577   case AMDGPUISD::RCP_IFLAG:
578   case AMDGPUISD::SIN_HW:
579   case AMDGPUISD::FMUL_LEGACY:
580   case AMDGPUISD::FMIN_LEGACY:
581   case AMDGPUISD::FMAX_LEGACY:
582   case AMDGPUISD::FMED3:
583     // TODO: handle llvm.amdgcn.fma.legacy
584     return true;
585   default:
586     return false;
587   }
588 }
589 
590 /// \p returns true if the operation will definitely need to use a 64-bit
591 /// encoding, and thus will use a VOP3 encoding regardless of the source
592 /// modifiers.
593 LLVM_READONLY
594 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
595   return N->getNumOperands() > 2 || VT == MVT::f64;
596 }
597 
598 // Most FP instructions support source modifiers, but this could be refined
599 // slightly.
600 LLVM_READONLY
601 static bool hasSourceMods(const SDNode *N) {
602   if (isa<MemSDNode>(N))
603     return false;
604 
605   switch (N->getOpcode()) {
606   case ISD::CopyToReg:
607   case ISD::SELECT:
608   case ISD::FDIV:
609   case ISD::FREM:
610   case ISD::INLINEASM:
611   case ISD::INLINEASM_BR:
612   case AMDGPUISD::DIV_SCALE:
613   case ISD::INTRINSIC_W_CHAIN:
614 
615   // TODO: Should really be looking at the users of the bitcast. These are
616   // problematic because bitcasts are used to legalize all stores to integer
617   // types.
618   case ISD::BITCAST:
619     return false;
620   case ISD::INTRINSIC_WO_CHAIN: {
621     switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
622     case Intrinsic::amdgcn_interp_p1:
623     case Intrinsic::amdgcn_interp_p2:
624     case Intrinsic::amdgcn_interp_mov:
625     case Intrinsic::amdgcn_interp_p1_f16:
626     case Intrinsic::amdgcn_interp_p2_f16:
627       return false;
628     default:
629       return true;
630     }
631   }
632   default:
633     return true;
634   }
635 }
636 
637 bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
638                                                  unsigned CostThreshold) {
639   // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
640   // it is truly free to use a source modifier in all cases. If there are
641   // multiple users but for each one will necessitate using VOP3, there will be
642   // a code size increase. Try to avoid increasing code size unless we know it
643   // will save on the instruction count.
644   unsigned NumMayIncreaseSize = 0;
645   MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
646 
647   // XXX - Should this limit number of uses to check?
648   for (const SDNode *U : N->uses()) {
649     if (!hasSourceMods(U))
650       return false;
651 
652     if (!opMustUseVOP3Encoding(U, VT)) {
653       if (++NumMayIncreaseSize > CostThreshold)
654         return false;
655     }
656   }
657 
658   return true;
659 }
660 
661 EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
662                                               ISD::NodeType ExtendKind) const {
663   assert(!VT.isVector() && "only scalar expected");
664 
665   // Round to the next multiple of 32-bits.
666   unsigned Size = VT.getSizeInBits();
667   if (Size <= 32)
668     return MVT::i32;
669   return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
670 }
671 
672 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
673   return MVT::i32;
674 }
675 
676 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
677   return true;
678 }
679 
680 // The backend supports 32 and 64 bit floating point immediates.
681 // FIXME: Why are we reporting vectors of FP immediates as legal?
682 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
683                                         bool ForCodeSize) const {
684   EVT ScalarVT = VT.getScalarType();
685   return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
686          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
687 }
688 
689 // We don't want to shrink f64 / f32 constants.
690 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
691   EVT ScalarVT = VT.getScalarType();
692   return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
693 }
694 
695 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
696                                                  ISD::LoadExtType ExtTy,
697                                                  EVT NewVT) const {
698   // TODO: This may be worth removing. Check regression tests for diffs.
699   if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
700     return false;
701 
702   unsigned NewSize = NewVT.getStoreSizeInBits();
703 
704   // If we are reducing to a 32-bit load or a smaller multi-dword load,
705   // this is always better.
706   if (NewSize >= 32)
707     return true;
708 
709   EVT OldVT = N->getValueType(0);
710   unsigned OldSize = OldVT.getStoreSizeInBits();
711 
712   MemSDNode *MN = cast<MemSDNode>(N);
713   unsigned AS = MN->getAddressSpace();
714   // Do not shrink an aligned scalar load to sub-dword.
715   // Scalar engine cannot do sub-dword loads.
716   if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
717       (AS == AMDGPUAS::CONSTANT_ADDRESS ||
718        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
719        (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
720         MN->isInvariant())) &&
721       AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
722     return false;
723 
724   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
725   // extloads, so doing one requires using a buffer_load. In cases where we
726   // still couldn't use a scalar load, using the wider load shouldn't really
727   // hurt anything.
728 
729   // If the old size already had to be an extload, there's no harm in continuing
730   // to reduce the width.
731   return (OldSize < 32);
732 }
733 
734 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
735                                                    const SelectionDAG &DAG,
736                                                    const MachineMemOperand &MMO) const {
737 
738   assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
739 
740   if (LoadTy.getScalarType() == MVT::i32)
741     return false;
742 
743   unsigned LScalarSize = LoadTy.getScalarSizeInBits();
744   unsigned CastScalarSize = CastTy.getScalarSizeInBits();
745 
746   if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
747     return false;
748 
749   unsigned Fast = 0;
750   return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
751                                         CastTy, MMO, &Fast) &&
752          Fast;
753 }
754 
755 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
756 // profitable with the expansion for 64-bit since it's generally good to
757 // speculate things.
758 bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
759   return true;
760 }
761 
762 bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
763   return true;
764 }
765 
766 bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
767   switch (N->getOpcode()) {
768   case ISD::EntryToken:
769   case ISD::TokenFactor:
770     return true;
771   case ISD::INTRINSIC_WO_CHAIN: {
772     unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
773     switch (IntrID) {
774     case Intrinsic::amdgcn_readfirstlane:
775     case Intrinsic::amdgcn_readlane:
776       return true;
777     }
778     return false;
779   }
780   case ISD::LOAD:
781     if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
782         AMDGPUAS::CONSTANT_ADDRESS_32BIT)
783       return true;
784     return false;
785   case AMDGPUISD::SETCC: // ballot-style instruction
786     return true;
787   }
788   return false;
789 }
790 
791 SDValue AMDGPUTargetLowering::getNegatedExpression(
792     SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
793     NegatibleCost &Cost, unsigned Depth) const {
794 
795   switch (Op.getOpcode()) {
796   case ISD::FMA:
797   case ISD::FMAD: {
798     // Negating a fma is not free if it has users without source mods.
799     if (!allUsesHaveSourceMods(Op.getNode()))
800       return SDValue();
801     break;
802   }
803   default:
804     break;
805   }
806 
807   return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
808                                               ForCodeSize, Cost, Depth);
809 }
810 
811 //===---------------------------------------------------------------------===//
812 // Target Properties
813 //===---------------------------------------------------------------------===//
814 
815 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
816   assert(VT.isFloatingPoint());
817 
818   // Packed operations do not have a fabs modifier.
819   return VT == MVT::f32 || VT == MVT::f64 ||
820          (Subtarget->has16BitInsts() && VT == MVT::f16);
821 }
822 
823 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
824   assert(VT.isFloatingPoint());
825   // Report this based on the end legalized type.
826   VT = VT.getScalarType();
827   return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
828 }
829 
830 bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
831                                                          unsigned NumElem,
832                                                          unsigned AS) const {
833   return true;
834 }
835 
836 bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
837   // There are few operations which truly have vector input operands. Any vector
838   // operation is going to involve operations on each component, and a
839   // build_vector will be a copy per element, so it always makes sense to use a
840   // build_vector input in place of the extracted element to avoid a copy into a
841   // super register.
842   //
843   // We should probably only do this if all users are extracts only, but this
844   // should be the common case.
845   return true;
846 }
847 
848 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
849   // Truncate is just accessing a subregister.
850 
851   unsigned SrcSize = Source.getSizeInBits();
852   unsigned DestSize = Dest.getSizeInBits();
853 
854   return DestSize < SrcSize && DestSize % 32 == 0 ;
855 }
856 
857 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
858   // Truncate is just accessing a subregister.
859 
860   unsigned SrcSize = Source->getScalarSizeInBits();
861   unsigned DestSize = Dest->getScalarSizeInBits();
862 
863   if (DestSize== 16 && Subtarget->has16BitInsts())
864     return SrcSize >= 32;
865 
866   return DestSize < SrcSize && DestSize % 32 == 0;
867 }
868 
869 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
870   unsigned SrcSize = Src->getScalarSizeInBits();
871   unsigned DestSize = Dest->getScalarSizeInBits();
872 
873   if (SrcSize == 16 && Subtarget->has16BitInsts())
874     return DestSize >= 32;
875 
876   return SrcSize == 32 && DestSize == 64;
877 }
878 
879 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
880   // Any register load of a 64-bit value really requires 2 32-bit moves. For all
881   // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
882   // this will enable reducing 64-bit operations the 32-bit, which is always
883   // good.
884 
885   if (Src == MVT::i16)
886     return Dest == MVT::i32 ||Dest == MVT::i64 ;
887 
888   return Src == MVT::i32 && Dest == MVT::i64;
889 }
890 
891 bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
892   return isZExtFree(Val.getValueType(), VT2);
893 }
894 
895 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
896   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
897   // limited number of native 64-bit operations. Shrinking an operation to fit
898   // in a single 32-bit register should always be helpful. As currently used,
899   // this is much less general than the name suggests, and is only used in
900   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
901   // not profitable, and may actually be harmful.
902   return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
903 }
904 
905 bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
906     const SDNode* N, CombineLevel Level) const {
907   assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
908           N->getOpcode() == ISD::SRL) &&
909          "Expected shift op");
910   // Always commute pre-type legalization and right shifts.
911   // We're looking for shl(or(x,y),z) patterns.
912   if (Level < CombineLevel::AfterLegalizeTypes ||
913       N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
914     return true;
915 
916   // If only user is a i32 right-shift, then don't destroy a BFE pattern.
917   if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
918       (N->use_begin()->getOpcode() == ISD::SRA ||
919        N->use_begin()->getOpcode() == ISD::SRL))
920     return false;
921 
922   // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
923   auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
924     if (LHS.getOpcode() != ISD::SHL)
925       return false;
926     auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
927     auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
928     auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
929     return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
930            LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
931            RHSLd->getExtensionType() == ISD::ZEXTLOAD;
932   };
933   SDValue LHS = N->getOperand(0).getOperand(0);
934   SDValue RHS = N->getOperand(0).getOperand(1);
935   return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
936 }
937 
938 //===---------------------------------------------------------------------===//
939 // TargetLowering Callbacks
940 //===---------------------------------------------------------------------===//
941 
942 CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
943                                                   bool IsVarArg) {
944   switch (CC) {
945   case CallingConv::AMDGPU_VS:
946   case CallingConv::AMDGPU_GS:
947   case CallingConv::AMDGPU_PS:
948   case CallingConv::AMDGPU_CS:
949   case CallingConv::AMDGPU_HS:
950   case CallingConv::AMDGPU_ES:
951   case CallingConv::AMDGPU_LS:
952     return CC_AMDGPU;
953   case CallingConv::C:
954   case CallingConv::Fast:
955   case CallingConv::Cold:
956     return CC_AMDGPU_Func;
957   case CallingConv::AMDGPU_Gfx:
958     return CC_SI_Gfx;
959   case CallingConv::AMDGPU_KERNEL:
960   case CallingConv::SPIR_KERNEL:
961   default:
962     report_fatal_error("Unsupported calling convention for call");
963   }
964 }
965 
966 CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
967                                                     bool IsVarArg) {
968   switch (CC) {
969   case CallingConv::AMDGPU_KERNEL:
970   case CallingConv::SPIR_KERNEL:
971     llvm_unreachable("kernels should not be handled here");
972   case CallingConv::AMDGPU_VS:
973   case CallingConv::AMDGPU_GS:
974   case CallingConv::AMDGPU_PS:
975   case CallingConv::AMDGPU_CS:
976   case CallingConv::AMDGPU_HS:
977   case CallingConv::AMDGPU_ES:
978   case CallingConv::AMDGPU_LS:
979     return RetCC_SI_Shader;
980   case CallingConv::AMDGPU_Gfx:
981     return RetCC_SI_Gfx;
982   case CallingConv::C:
983   case CallingConv::Fast:
984   case CallingConv::Cold:
985     return RetCC_AMDGPU_Func;
986   default:
987     report_fatal_error("Unsupported calling convention.");
988   }
989 }
990 
991 /// The SelectionDAGBuilder will automatically promote function arguments
992 /// with illegal types.  However, this does not work for the AMDGPU targets
993 /// since the function arguments are stored in memory as these illegal types.
994 /// In order to handle this properly we need to get the original types sizes
995 /// from the LLVM IR Function and fixup the ISD:InputArg values before
996 /// passing them to AnalyzeFormalArguments()
997 
998 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
999 /// input values across multiple registers.  Each item in the Ins array
1000 /// represents a single value that will be stored in registers.  Ins[x].VT is
1001 /// the value type of the value that will be stored in the register, so
1002 /// whatever SDNode we lower the argument to needs to be this type.
1003 ///
1004 /// In order to correctly lower the arguments we need to know the size of each
1005 /// argument.  Since Ins[x].VT gives us the size of the register that will
1006 /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1007 /// for the original function argument so that we can deduce the correct memory
1008 /// type to use for Ins[x].  In most cases the correct memory type will be
1009 /// Ins[x].ArgVT.  However, this will not always be the case.  If, for example,
1010 /// we have a kernel argument of type v8i8, this argument will be split into
1011 /// 8 parts and each part will be represented by its own item in the Ins array.
1012 /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1013 /// the argument before it was split.  From this, we deduce that the memory type
1014 /// for each individual part is i8.  We pass the memory type as LocVT to the
1015 /// calling convention analysis function and the register type (Ins[x].VT) as
1016 /// the ValVT.
1017 void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
1018   CCState &State,
1019   const SmallVectorImpl<ISD::InputArg> &Ins) const {
1020   const MachineFunction &MF = State.getMachineFunction();
1021   const Function &Fn = MF.getFunction();
1022   LLVMContext &Ctx = Fn.getParent()->getContext();
1023   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1024   const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
1025   CallingConv::ID CC = Fn.getCallingConv();
1026 
1027   Align MaxAlign = Align(1);
1028   uint64_t ExplicitArgOffset = 0;
1029   const DataLayout &DL = Fn.getParent()->getDataLayout();
1030 
1031   unsigned InIndex = 0;
1032 
1033   for (const Argument &Arg : Fn.args()) {
1034     const bool IsByRef = Arg.hasByRefAttr();
1035     Type *BaseArgTy = Arg.getType();
1036     Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1037     Align Alignment = DL.getValueOrABITypeAlignment(
1038         IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1039     MaxAlign = std::max(Alignment, MaxAlign);
1040     uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1041 
1042     uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1043     ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1044 
1045     // We're basically throwing away everything passed into us and starting over
1046     // to get accurate in-memory offsets. The "PartOffset" is completely useless
1047     // to us as computed in Ins.
1048     //
1049     // We also need to figure out what type legalization is trying to do to get
1050     // the correct memory offsets.
1051 
1052     SmallVector<EVT, 16> ValueVTs;
1053     SmallVector<uint64_t, 16> Offsets;
1054     ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1055 
1056     for (unsigned Value = 0, NumValues = ValueVTs.size();
1057          Value != NumValues; ++Value) {
1058       uint64_t BasePartOffset = Offsets[Value];
1059 
1060       EVT ArgVT = ValueVTs[Value];
1061       EVT MemVT = ArgVT;
1062       MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1063       unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1064 
1065       if (NumRegs == 1) {
1066         // This argument is not split, so the IR type is the memory type.
1067         if (ArgVT.isExtended()) {
1068           // We have an extended type, like i24, so we should just use the
1069           // register type.
1070           MemVT = RegisterVT;
1071         } else {
1072           MemVT = ArgVT;
1073         }
1074       } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1075                  ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1076         assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1077         // We have a vector value which has been split into a vector with
1078         // the same scalar type, but fewer elements.  This should handle
1079         // all the floating-point vector types.
1080         MemVT = RegisterVT;
1081       } else if (ArgVT.isVector() &&
1082                  ArgVT.getVectorNumElements() == NumRegs) {
1083         // This arg has been split so that each element is stored in a separate
1084         // register.
1085         MemVT = ArgVT.getScalarType();
1086       } else if (ArgVT.isExtended()) {
1087         // We have an extended type, like i65.
1088         MemVT = RegisterVT;
1089       } else {
1090         unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1091         assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1092         if (RegisterVT.isInteger()) {
1093           MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1094         } else if (RegisterVT.isVector()) {
1095           assert(!RegisterVT.getScalarType().isFloatingPoint());
1096           unsigned NumElements = RegisterVT.getVectorNumElements();
1097           assert(MemoryBits % NumElements == 0);
1098           // This vector type has been split into another vector type with
1099           // a different elements size.
1100           EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1101                                            MemoryBits / NumElements);
1102           MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1103         } else {
1104           llvm_unreachable("cannot deduce memory type.");
1105         }
1106       }
1107 
1108       // Convert one element vectors to scalar.
1109       if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1110         MemVT = MemVT.getScalarType();
1111 
1112       // Round up vec3/vec5 argument.
1113       if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1114         assert(MemVT.getVectorNumElements() == 3 ||
1115                MemVT.getVectorNumElements() == 5 ||
1116                (MemVT.getVectorNumElements() >= 9 &&
1117                 MemVT.getVectorNumElements() <= 12));
1118         MemVT = MemVT.getPow2VectorType(State.getContext());
1119       } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1120         MemVT = MemVT.getRoundIntegerType(State.getContext());
1121       }
1122 
1123       unsigned PartOffset = 0;
1124       for (unsigned i = 0; i != NumRegs; ++i) {
1125         State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1126                                                BasePartOffset + PartOffset,
1127                                                MemVT.getSimpleVT(),
1128                                                CCValAssign::Full));
1129         PartOffset += MemVT.getStoreSize();
1130       }
1131     }
1132   }
1133 }
1134 
1135 SDValue AMDGPUTargetLowering::LowerReturn(
1136   SDValue Chain, CallingConv::ID CallConv,
1137   bool isVarArg,
1138   const SmallVectorImpl<ISD::OutputArg> &Outs,
1139   const SmallVectorImpl<SDValue> &OutVals,
1140   const SDLoc &DL, SelectionDAG &DAG) const {
1141   // FIXME: Fails for r600 tests
1142   //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1143   // "wave terminate should not have return values");
1144   return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1145 }
1146 
1147 //===---------------------------------------------------------------------===//
1148 // Target specific lowering
1149 //===---------------------------------------------------------------------===//
1150 
1151 /// Selects the correct CCAssignFn for a given CallingConvention value.
1152 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1153                                                     bool IsVarArg) {
1154   return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1155 }
1156 
1157 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1158                                                       bool IsVarArg) {
1159   return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1160 }
1161 
1162 SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1163                                                   SelectionDAG &DAG,
1164                                                   MachineFrameInfo &MFI,
1165                                                   int ClobberedFI) const {
1166   SmallVector<SDValue, 8> ArgChains;
1167   int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1168   int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1169 
1170   // Include the original chain at the beginning of the list. When this is
1171   // used by target LowerCall hooks, this helps legalize find the
1172   // CALLSEQ_BEGIN node.
1173   ArgChains.push_back(Chain);
1174 
1175   // Add a chain value for each stack argument corresponding
1176   for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1177     if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1178       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1179         if (FI->getIndex() < 0) {
1180           int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1181           int64_t InLastByte = InFirstByte;
1182           InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1183 
1184           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1185               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1186             ArgChains.push_back(SDValue(L, 1));
1187         }
1188       }
1189     }
1190   }
1191 
1192   // Build a tokenfactor for all the chains.
1193   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1194 }
1195 
1196 SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1197                                                  SmallVectorImpl<SDValue> &InVals,
1198                                                  StringRef Reason) const {
1199   SDValue Callee = CLI.Callee;
1200   SelectionDAG &DAG = CLI.DAG;
1201 
1202   const Function &Fn = DAG.getMachineFunction().getFunction();
1203 
1204   StringRef FuncName("<unknown>");
1205 
1206   if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1207     FuncName = G->getSymbol();
1208   else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1209     FuncName = G->getGlobal()->getName();
1210 
1211   DiagnosticInfoUnsupported NoCalls(
1212     Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1213   DAG.getContext()->diagnose(NoCalls);
1214 
1215   if (!CLI.IsTailCall) {
1216     for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1217       InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1218   }
1219 
1220   return DAG.getEntryNode();
1221 }
1222 
1223 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1224                                         SmallVectorImpl<SDValue> &InVals) const {
1225   return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1226 }
1227 
1228 SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1229                                                       SelectionDAG &DAG) const {
1230   const Function &Fn = DAG.getMachineFunction().getFunction();
1231 
1232   DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1233                                             SDLoc(Op).getDebugLoc());
1234   DAG.getContext()->diagnose(NoDynamicAlloca);
1235   auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1236   return DAG.getMergeValues(Ops, SDLoc());
1237 }
1238 
1239 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1240                                              SelectionDAG &DAG) const {
1241   switch (Op.getOpcode()) {
1242   default:
1243     Op->print(errs(), &DAG);
1244     llvm_unreachable("Custom lowering code for this "
1245                      "instruction is not implemented yet!");
1246     break;
1247   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1248   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1249   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1250   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1251   case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1252   case ISD::FREM: return LowerFREM(Op, DAG);
1253   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1254   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1255   case ISD::FRINT: return LowerFRINT(Op, DAG);
1256   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1257   case ISD::FROUNDEVEN:
1258     return LowerFROUNDEVEN(Op, DAG);
1259   case ISD::FROUND: return LowerFROUND(Op, DAG);
1260   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1261   case ISD::FLOG:
1262     return LowerFLOG(Op, DAG, numbers::ln2f);
1263   case ISD::FLOG10:
1264     return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
1265   case ISD::FEXP:
1266     return lowerFEXP(Op, DAG);
1267   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1268   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1269   case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1270   case ISD::FP_TO_SINT:
1271   case ISD::FP_TO_UINT:
1272     return LowerFP_TO_INT(Op, DAG);
1273   case ISD::CTTZ:
1274   case ISD::CTTZ_ZERO_UNDEF:
1275   case ISD::CTLZ:
1276   case ISD::CTLZ_ZERO_UNDEF:
1277     return LowerCTLZ_CTTZ(Op, DAG);
1278   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1279   }
1280   return Op;
1281 }
1282 
1283 void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1284                                               SmallVectorImpl<SDValue> &Results,
1285                                               SelectionDAG &DAG) const {
1286   switch (N->getOpcode()) {
1287   case ISD::SIGN_EXTEND_INREG:
1288     // Different parts of legalization seem to interpret which type of
1289     // sign_extend_inreg is the one to check for custom lowering. The extended
1290     // from type is what really matters, but some places check for custom
1291     // lowering of the result type. This results in trying to use
1292     // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1293     // nothing here and let the illegal result integer be handled normally.
1294     return;
1295   default:
1296     return;
1297   }
1298 }
1299 
1300 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1301                                                  SDValue Op,
1302                                                  SelectionDAG &DAG) const {
1303 
1304   const DataLayout &DL = DAG.getDataLayout();
1305   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1306   const GlobalValue *GV = G->getGlobal();
1307 
1308   if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1309       G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1310     if (!MFI->isModuleEntryFunction() &&
1311         !GV->getName().equals("llvm.amdgcn.module.lds")) {
1312       SDLoc DL(Op);
1313       const Function &Fn = DAG.getMachineFunction().getFunction();
1314       DiagnosticInfoUnsupported BadLDSDecl(
1315         Fn, "local memory global used by non-kernel function",
1316         DL.getDebugLoc(), DS_Warning);
1317       DAG.getContext()->diagnose(BadLDSDecl);
1318 
1319       // We currently don't have a way to correctly allocate LDS objects that
1320       // aren't directly associated with a kernel. We do force inlining of
1321       // functions that use local objects. However, if these dead functions are
1322       // not eliminated, we don't want a compile time error. Just emit a warning
1323       // and a trap, since there should be no callable path here.
1324       SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1325       SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1326                                         Trap, DAG.getRoot());
1327       DAG.setRoot(OutputChain);
1328       return DAG.getUNDEF(Op.getValueType());
1329     }
1330 
1331     // XXX: What does the value of G->getOffset() mean?
1332     assert(G->getOffset() == 0 &&
1333          "Do not know what to do with an non-zero offset");
1334 
1335     // TODO: We could emit code to handle the initialization somewhere.
1336     // We ignore the initializer for now and legalize it to allow selection.
1337     // The initializer will anyway get errored out during assembly emission.
1338     unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1339     return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1340   }
1341   return SDValue();
1342 }
1343 
1344 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1345                                                   SelectionDAG &DAG) const {
1346   SmallVector<SDValue, 8> Args;
1347   SDLoc SL(Op);
1348 
1349   EVT VT = Op.getValueType();
1350   if (VT.getVectorElementType().getSizeInBits() < 32) {
1351     unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1352     if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1353       unsigned NewNumElt = OpBitSize / 32;
1354       EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1355                                       : EVT::getVectorVT(*DAG.getContext(),
1356                                                          MVT::i32, NewNumElt);
1357       for (const SDUse &U : Op->ops()) {
1358         SDValue In = U.get();
1359         SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1360         if (NewNumElt > 1)
1361           DAG.ExtractVectorElements(NewIn, Args);
1362         else
1363           Args.push_back(NewIn);
1364       }
1365 
1366       EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1367                                    NewNumElt * Op.getNumOperands());
1368       SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1369       return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1370     }
1371   }
1372 
1373   for (const SDUse &U : Op->ops())
1374     DAG.ExtractVectorElements(U.get(), Args);
1375 
1376   return DAG.getBuildVector(Op.getValueType(), SL, Args);
1377 }
1378 
1379 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1380                                                      SelectionDAG &DAG) const {
1381 
1382   SmallVector<SDValue, 8> Args;
1383   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1384   EVT VT = Op.getValueType();
1385   EVT SrcVT = Op.getOperand(0).getValueType();
1386 
1387   // For these types, we have some TableGen patterns except if the index is 1
1388   if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
1389        (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
1390       Start != 1)
1391     return Op;
1392 
1393   if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) ||
1394        (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) &&
1395       (Start == 0 || Start == 4))
1396     return Op;
1397 
1398   if (((SrcVT == MVT::v16f16 && VT == MVT::v8f16) ||
1399        (SrcVT == MVT::v16i16 && VT == MVT::v8i16)) &&
1400       (Start == 0 || Start == 8))
1401     return Op;
1402 
1403   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1404                             VT.getVectorNumElements());
1405 
1406   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1407 }
1408 
1409 /// Generate Min/Max node
1410 SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1411                                                    SDValue LHS, SDValue RHS,
1412                                                    SDValue True, SDValue False,
1413                                                    SDValue CC,
1414                                                    DAGCombinerInfo &DCI) const {
1415   if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1416     return SDValue();
1417 
1418   SelectionDAG &DAG = DCI.DAG;
1419   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1420   switch (CCOpcode) {
1421   case ISD::SETOEQ:
1422   case ISD::SETONE:
1423   case ISD::SETUNE:
1424   case ISD::SETNE:
1425   case ISD::SETUEQ:
1426   case ISD::SETEQ:
1427   case ISD::SETFALSE:
1428   case ISD::SETFALSE2:
1429   case ISD::SETTRUE:
1430   case ISD::SETTRUE2:
1431   case ISD::SETUO:
1432   case ISD::SETO:
1433     break;
1434   case ISD::SETULE:
1435   case ISD::SETULT: {
1436     if (LHS == True)
1437       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1438     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1439   }
1440   case ISD::SETOLE:
1441   case ISD::SETOLT:
1442   case ISD::SETLE:
1443   case ISD::SETLT: {
1444     // Ordered. Assume ordered for undefined.
1445 
1446     // Only do this after legalization to avoid interfering with other combines
1447     // which might occur.
1448     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1449         !DCI.isCalledByLegalizer())
1450       return SDValue();
1451 
1452     // We need to permute the operands to get the correct NaN behavior. The
1453     // selected operand is the second one based on the failing compare with NaN,
1454     // so permute it based on the compare type the hardware uses.
1455     if (LHS == True)
1456       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1457     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1458   }
1459   case ISD::SETUGE:
1460   case ISD::SETUGT: {
1461     if (LHS == True)
1462       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1463     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1464   }
1465   case ISD::SETGT:
1466   case ISD::SETGE:
1467   case ISD::SETOGE:
1468   case ISD::SETOGT: {
1469     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1470         !DCI.isCalledByLegalizer())
1471       return SDValue();
1472 
1473     if (LHS == True)
1474       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1475     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1476   }
1477   case ISD::SETCC_INVALID:
1478     llvm_unreachable("Invalid setcc condcode!");
1479   }
1480   return SDValue();
1481 }
1482 
1483 std::pair<SDValue, SDValue>
1484 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1485   SDLoc SL(Op);
1486 
1487   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1488 
1489   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1490   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1491 
1492   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1493   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1494 
1495   return std::pair(Lo, Hi);
1496 }
1497 
1498 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1499   SDLoc SL(Op);
1500 
1501   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1502   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1503   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1504 }
1505 
1506 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1507   SDLoc SL(Op);
1508 
1509   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1510   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1511   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1512 }
1513 
1514 // Split a vector type into two parts. The first part is a power of two vector.
1515 // The second part is whatever is left over, and is a scalar if it would
1516 // otherwise be a 1-vector.
1517 std::pair<EVT, EVT>
1518 AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1519   EVT LoVT, HiVT;
1520   EVT EltVT = VT.getVectorElementType();
1521   unsigned NumElts = VT.getVectorNumElements();
1522   unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1523   LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1524   HiVT = NumElts - LoNumElts == 1
1525              ? EltVT
1526              : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1527   return std::pair(LoVT, HiVT);
1528 }
1529 
1530 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1531 // scalar.
1532 std::pair<SDValue, SDValue>
1533 AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1534                                   const EVT &LoVT, const EVT &HiVT,
1535                                   SelectionDAG &DAG) const {
1536   assert(LoVT.getVectorNumElements() +
1537                  (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1538              N.getValueType().getVectorNumElements() &&
1539          "More vector elements requested than available!");
1540   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1541                            DAG.getVectorIdxConstant(0, DL));
1542   SDValue Hi = DAG.getNode(
1543       HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1544       HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1545   return std::pair(Lo, Hi);
1546 }
1547 
1548 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1549                                               SelectionDAG &DAG) const {
1550   LoadSDNode *Load = cast<LoadSDNode>(Op);
1551   EVT VT = Op.getValueType();
1552   SDLoc SL(Op);
1553 
1554 
1555   // If this is a 2 element vector, we really want to scalarize and not create
1556   // weird 1 element vectors.
1557   if (VT.getVectorNumElements() == 2) {
1558     SDValue Ops[2];
1559     std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1560     return DAG.getMergeValues(Ops, SL);
1561   }
1562 
1563   SDValue BasePtr = Load->getBasePtr();
1564   EVT MemVT = Load->getMemoryVT();
1565 
1566   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1567 
1568   EVT LoVT, HiVT;
1569   EVT LoMemVT, HiMemVT;
1570   SDValue Lo, Hi;
1571 
1572   std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1573   std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1574   std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1575 
1576   unsigned Size = LoMemVT.getStoreSize();
1577   Align BaseAlign = Load->getAlign();
1578   Align HiAlign = commonAlignment(BaseAlign, Size);
1579 
1580   SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1581                                   Load->getChain(), BasePtr, SrcValue, LoMemVT,
1582                                   BaseAlign, Load->getMemOperand()->getFlags());
1583   SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
1584   SDValue HiLoad =
1585       DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1586                      HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1587                      HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1588 
1589   SDValue Join;
1590   if (LoVT == HiVT) {
1591     // This is the case that the vector is power of two so was evenly split.
1592     Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1593   } else {
1594     Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1595                        DAG.getVectorIdxConstant(0, SL));
1596     Join = DAG.getNode(
1597         HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1598         VT, Join, HiLoad,
1599         DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1600   }
1601 
1602   SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1603                                      LoLoad.getValue(1), HiLoad.getValue(1))};
1604 
1605   return DAG.getMergeValues(Ops, SL);
1606 }
1607 
1608 SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1609                                                      SelectionDAG &DAG) const {
1610   LoadSDNode *Load = cast<LoadSDNode>(Op);
1611   EVT VT = Op.getValueType();
1612   SDValue BasePtr = Load->getBasePtr();
1613   EVT MemVT = Load->getMemoryVT();
1614   SDLoc SL(Op);
1615   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1616   Align BaseAlign = Load->getAlign();
1617   unsigned NumElements = MemVT.getVectorNumElements();
1618 
1619   // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1620   // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1621   if (NumElements != 3 ||
1622       (BaseAlign < Align(8) &&
1623        !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1624     return SplitVectorLoad(Op, DAG);
1625 
1626   assert(NumElements == 3);
1627 
1628   EVT WideVT =
1629       EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1630   EVT WideMemVT =
1631       EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1632   SDValue WideLoad = DAG.getExtLoad(
1633       Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1634       WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1635   return DAG.getMergeValues(
1636       {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1637                    DAG.getVectorIdxConstant(0, SL)),
1638        WideLoad.getValue(1)},
1639       SL);
1640 }
1641 
1642 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1643                                                SelectionDAG &DAG) const {
1644   StoreSDNode *Store = cast<StoreSDNode>(Op);
1645   SDValue Val = Store->getValue();
1646   EVT VT = Val.getValueType();
1647 
1648   // If this is a 2 element vector, we really want to scalarize and not create
1649   // weird 1 element vectors.
1650   if (VT.getVectorNumElements() == 2)
1651     return scalarizeVectorStore(Store, DAG);
1652 
1653   EVT MemVT = Store->getMemoryVT();
1654   SDValue Chain = Store->getChain();
1655   SDValue BasePtr = Store->getBasePtr();
1656   SDLoc SL(Op);
1657 
1658   EVT LoVT, HiVT;
1659   EVT LoMemVT, HiMemVT;
1660   SDValue Lo, Hi;
1661 
1662   std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1663   std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1664   std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1665 
1666   SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1667 
1668   const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1669   Align BaseAlign = Store->getAlign();
1670   unsigned Size = LoMemVT.getStoreSize();
1671   Align HiAlign = commonAlignment(BaseAlign, Size);
1672 
1673   SDValue LoStore =
1674       DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1675                         Store->getMemOperand()->getFlags());
1676   SDValue HiStore =
1677       DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1678                         HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1679 
1680   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1681 }
1682 
1683 // This is a shortcut for integer division because we have fast i32<->f32
1684 // conversions, and fast f32 reciprocal instructions. The fractional part of a
1685 // float is enough to accurately represent up to a 24-bit signed integer.
     //
     // Op is a divrem node (operand 0 = dividend, operand 1 = divisor) and Sign
     // selects signed vs. unsigned conversions. The shortcut is only taken when
     // both operands have at least 9 known sign bits; otherwise an empty SDValue
     // is returned so the caller can fall back to another lowering. On success
     // the merged pair {quotient, remainder} is returned.
     //
     // The callers visible here (LowerUDIVREM / LowerSDIVREM) only invoke this
     // for i32.
1686 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1687                                             bool Sign) const {
1688   SDLoc DL(Op);
1689   EVT VT = Op.getValueType();
1690   SDValue LHS = Op.getOperand(0);
1691   SDValue RHS = Op.getOperand(1);
1692   MVT IntVT = MVT::i32;
1693   MVT FltVT = MVT::f32;
1694
       // Bail out unless both operands have at least 9 sign bits.
1695   unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1696   if (LHSSignBits < 9)
1697     return SDValue();
1698
1699   unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1700   if (RHSSignBits < 9)
1701     return SDValue();
1702
       // DivBits is the width the results are truncated to at the end: the bits
       // not covered by the common sign bits, plus one more bit in the signed
       // case.
1703   unsigned BitSize = VT.getSizeInBits();
1704   unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1705   unsigned DivBits = BitSize - SignBits;
1706   if (Sign)
1707     ++DivBits;
1708
1709   ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1710   ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1711
       // jq is the correction added to the truncated float quotient when the
       // float remainder shows the quotient is one step short. Unsigned: +1.
1712   SDValue jq = DAG.getConstant(1, DL, IntVT);
1713
1714   if (Sign) {
         // Signed: +1 or -1 according to the sign of LHS ^ RHS. Because both
         // operands have at least 9 sign bits, bit (BitSize - 2) equals the sign
         // bit, so the arithmetic shift below yields 0 or -1.
1715     // char|short jq = ia ^ ib;
1716     jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1717
1718     // jq = jq >> (bitsize - 2)
1719     jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1720                      DAG.getConstant(BitSize - 2, DL, VT));
1721
1722     // jq = jq | 0x1
1723     jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1724   }
1725
1726   // int ia = (int)LHS;
1727   SDValue ia = LHS;
1728
1729   // int ib = (int)RHS;
1730   SDValue ib = RHS;
1731
1732   // float fa = (float)ia;
1733   SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1734
1735   // float fb = (float)ib;
1736   SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1737
       // float fq = fa * rcp(fb);
1738   SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1739                            fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1740
1741   // fq = trunc(fq);
1742   fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1743
1744   // float fqneg = -fq;
1745   SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1746
1747   MachineFunction &MF = DAG.getMachineFunction();
1748
       // On GCN, pick FMAD_FTZ when the function's FP mode reports
       // allFP32Denormals().
1749   bool UseFmadFtz = false;
1750   if (Subtarget->isGCN()) {
1751     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1752     UseFmadFtz = MFI->getMode().allFP32Denormals();
1753   }
1754
1755   // float fr = mad(fqneg, fb, fa);
       // Subtargets without f32 mad/mac instructions use FMA instead.
1756   unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1757                     : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1758                                  : (unsigned)ISD::FMAD;
1759   SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1760
1761   // int iq = (int)fq;
1762   SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1763
1764   // fr = fabs(fr);
1765   fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1766
1767   // fb = fabs(fb);
1768   fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1769
1770   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1771
1772   // int cv = fr >= fb;
1773   SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1774
1775   // jq = (cv ? jq : 0);
1776   jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1777
1778   // dst = iq + jq;
1779   SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1780
1781   // Rem needs compensation, it's easier to recompute it
       // Rem = LHS - Div * RHS.
1782   SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1783   Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1784
1785   // Truncate to number of bits this divide really is.
       // Signed results are sign-extended from DivBits; unsigned results are
       // masked down to their low DivBits bits.
1786   if (Sign) {
1787     SDValue InRegSize
1788       = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1789     Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1790     Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1791   } else {
1792     SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1793     Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1794     Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1795   }
1796
1797   return DAG.getMergeValues({ Div, Rem }, DL);
1798 }
1799 
     // Expand a 64-bit unsigned divrem. Op must be of type i64 (asserted);
     // operand 0 is the dividend and operand 1 the divisor. The quotient is
     // appended to Results first, followed by the remainder.
     //
     // Three strategies are used, in this order:
     //  1. If the high 32 bits of both operands are known to be zero, emit a
     //     single 32-bit UDIVREM and widen both results with a zero high half.
     //  2. If i64 is a legal type, use a float reciprocal estimate refined by
     //     two rounds of integer Newton-Raphson, then up to two conditional
     //     quotient/remainder corrections expressed as selects.
     //  3. Otherwise (r600), divide the high half with 32-bit UDIV/UREM and
     //     produce the low 32 quotient bits one at a time.
1800 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1801                                       SelectionDAG &DAG,
1802                                       SmallVectorImpl<SDValue> &Results) const {
1803   SDLoc DL(Op);
1804   EVT VT = Op.getValueType();
1805
1806   assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1807
1808   EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1809
1810   SDValue One = DAG.getConstant(1, DL, HalfVT);
1811   SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1812
1813   // HiLo split
1814   SDValue LHS = Op.getOperand(0);
1815   SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1816   SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1817
1818   SDValue RHS = Op.getOperand(1);
1819   SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1820   SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1821
       // Strategy 1: both operands fit in 32 bits, so a 32-bit divrem suffices.
1822   if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1823       DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1824
1825     SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1826                               LHS_Lo, RHS_Lo);
1827
         // Widen both results to i64 with a zero high half.
1828     SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1829     SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1830
1831     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1832     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1833     return;
1834   }
1835
       // Strategy 2.
       // NOTE(review): presumably i64 is only legal on subtargets whose function
       // info is SIMachineFunctionInfo, which is what makes the lookup below
       // valid - confirm.
1836   if (isTypeLegal(MVT::i64)) {
1837     // The algorithm here is based on ideas from "Software Integer Division",
1838     // Tom Rodeheffer, August 2008.
1839
1840     MachineFunction &MF = DAG.getMachineFunction();
1841     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1842
1843     // Compute denominator reciprocal.
         // FMA without f32 mad/mac instructions; otherwise FMAD, or FMAD_FTZ
         // when the FP mode reports allFP32Denormals().
1844     unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
1845                     (unsigned)ISD::FMA :
1846                     !MFI->getMode().allFP32Denormals() ?
1847                     (unsigned)ISD::FMAD :
1848                     (unsigned)AMDGPUISD::FMAD_FTZ;
1849
         // The f32 bit patterns used below decode to:
         //   0x4f800000 =  2^32
         //   0x5f7ffffc =  just below 2^64
         //   0x2f800000 =  2^-32
         //   0xcf800000 = -2^32
         // Mad1 = float(RHS_Hi) * 2^32 + float(RHS_Lo), a float view of RHS.
1850     SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1851     SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1852     SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1853       DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1854       Cvt_Lo);
1855     SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1856     SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1857       DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
         // Split the scaled reciprocal into 32-bit halves: Trunc is the high
         // part (Mul1 * 2^-32, truncated) and Mad2 = Mul1 - Trunc * 2^32 is the
         // low part.
1858     SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1859       DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1860     SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1861     SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1862       DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1863       Mul1);
1864     SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1865     SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1866     SDValue Rcp64 = DAG.getBitcast(VT,
1867                         DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1868
1869     SDValue Zero64 = DAG.getConstant(0, DL, VT);
1870     SDValue One64  = DAG.getConstant(1, DL, VT);
1871     SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1872     SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1873
1874     // First round of UNR (Unsigned integer Newton-Raphson).
         // Add1 = Rcp64 + mulhu(Rcp64, -RHS * Rcp64), with the 64-bit add done
         // as a 32-bit add-with-carry chain.
1875     SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1876     SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1877     SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1878     SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1879                                     Zero);
1880     SDValue Mulhi1_Hi =
1881         DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, One);
1882     SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1883                                   Mulhi1_Lo, Zero1);
1884     SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1885                                   Mulhi1_Hi, Add1_Lo.getValue(1));
1886     SDValue Add1 = DAG.getBitcast(VT,
1887                         DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1888
1889     // Second round of UNR.
1890     SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1891     SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1892     SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1893                                     Zero);
1894     SDValue Mulhi2_Hi =
1895         DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, One);
1896     SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1897                                   Mulhi2_Lo, Zero1);
1898     SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Hi,
1899                                   Mulhi2_Hi, Add2_Lo.getValue(1));
1900     SDValue Add2 = DAG.getBitcast(VT,
1901                         DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
1902
         // Quotient estimate: Mulhi3 = mulhu(LHS, Add2).
1903     SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1904
         // Remainder estimate: Sub1 = LHS - RHS * Mulhi3, computed as a 32-bit
         // subtract-with-borrow chain.
1905     SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1906
1907     SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1908     SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1909     SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1910                                   Mul3_Lo, Zero1);
1911     SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1912                                   Mul3_Hi, Sub1_Lo.getValue(1));
1913     SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1914     SDValue Sub1 = DAG.getBitcast(VT,
1915                         DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1916
         // C3 is all-ones when Sub1 >= RHS as a 64-bit unsigned compare built
         // from the halves (the low compare decides when the high halves are
         // equal), and zero otherwise.
1917     SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1918     SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1919                                  ISD::SETUGE);
1920     SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1921                                  ISD::SETUGE);
1922     SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1923
1924     // TODO: Here and below portions of the code can be enclosed into if/endif.
1925     // Currently control flow is unconditional and we have 4 selects after
1926     // potential endif to substitute PHIs.
1927
1928     // if C3 != 0 ...
         // First correction candidate: Sub2 = Sub1 - RHS, Add3 = Mulhi3 + 1.
1929     SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1930                                   RHS_Lo, Zero1);
1931     SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1932                                   RHS_Hi, Sub1_Lo.getValue(1));
1933     SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1934                                   Zero, Sub2_Lo.getValue(1));
1935     SDValue Sub2 = DAG.getBitcast(VT,
1936                         DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
1937
1938     SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
1939
         // C6: same 64-bit "Sub2 >= RHS" test as C3, applied to Sub2.
1940     SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
1941                                  ISD::SETUGE);
1942     SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
1943                                  ISD::SETUGE);
1944     SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
1945
1946     // if (C6 != 0)
         // Second correction candidate: Sub3 = Sub2 - RHS, Add4 = Add3 + 1.
1947     SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
1948
1949     SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
1950                                   RHS_Lo, Zero1);
1951     SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1952                                   RHS_Hi, Sub2_Lo.getValue(1));
1953     SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
1954                                   Zero, Sub3_Lo.getValue(1));
1955     SDValue Sub3 = DAG.getBitcast(VT,
1956                         DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
1957
1958     // endif C6
1959     // endif C3
1960
         // Pick the uncorrected, once-corrected or twice-corrected results
         // according to C3 and C6.
1961     SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
1962     SDValue Div  = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
1963
1964     SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
1965     SDValue Rem  = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
1966
1967     Results.push_back(Div);
1968     Results.push_back(Rem);
1969
1970     return;
1971   }
1972
1973   // r600 expansion.
1974   // Get Speculative values
       // 32-bit divide of the high dividend half by the low divisor half. The
       // selects below use these results only when RHS_Hi == 0.
1975   SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1976   SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1977
1978   SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
1979   SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
1980   REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1981
1982   SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
1983   SDValue DIV_Lo = Zero;
1984
1985   const unsigned halfBitWidth = HalfVT.getSizeInBits();
1986
       // Produce the low quotient half one bit per iteration, from the most
       // significant bit of LHS_Lo down: shift the next dividend bit into REM,
       // and if REM >= RHS set the quotient bit and subtract RHS.
1987   for (unsigned i = 0; i < halfBitWidth; ++i) {
1988     const unsigned bitPos = halfBitWidth - i - 1;
1989     SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
1990     // Get value of high bit
1991     SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
1992     HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
1993     HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
1994
1995     // Shift
1996     REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
1997     // Add LHS high bit
1998     REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
1999
2000     SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2001     SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2002
2003     DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2004
2005     // Update REM
2006     SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2007     REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2008   }
2009
2010   SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2011   DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2012   Results.push_back(DIV);
2013   Results.push_back(REM);
2014 }
2015 
2016 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2017                                            SelectionDAG &DAG) const {
2018   SDLoc DL(Op);
2019   EVT VT = Op.getValueType();
2020 
2021   if (VT == MVT::i64) {
2022     SmallVector<SDValue, 2> Results;
2023     LowerUDIVREM64(Op, DAG, Results);
2024     return DAG.getMergeValues(Results, DL);
2025   }
2026 
2027   if (VT == MVT::i32) {
2028     if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2029       return Res;
2030   }
2031 
2032   SDValue X = Op.getOperand(0);
2033   SDValue Y = Op.getOperand(1);
2034 
2035   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2036   // algorithm used here.
2037 
2038   // Initial estimate of inv(y).
2039   SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2040 
2041   // One round of UNR.
2042   SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2043   SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2044   Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2045                   DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2046 
2047   // Quotient/remainder estimate.
2048   SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2049   SDValue R =
2050       DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2051 
2052   // First quotient/remainder refinement.
2053   EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2054   SDValue One = DAG.getConstant(1, DL, VT);
2055   SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2056   Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2057                   DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2058   R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2059                   DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2060 
2061   // Second quotient/remainder refinement.
2062   Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2063   Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2064                   DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2065   R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2066                   DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2067 
2068   return DAG.getMergeValues({Q, R}, DL);
2069 }
2070 
2071 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2072                                            SelectionDAG &DAG) const {
2073   SDLoc DL(Op);
2074   EVT VT = Op.getValueType();
2075 
2076   SDValue LHS = Op.getOperand(0);
2077   SDValue RHS = Op.getOperand(1);
2078 
2079   SDValue Zero = DAG.getConstant(0, DL, VT);
2080   SDValue NegOne = DAG.getConstant(-1, DL, VT);
2081 
2082   if (VT == MVT::i32) {
2083     if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2084       return Res;
2085   }
2086 
2087   if (VT == MVT::i64 &&
2088       DAG.ComputeNumSignBits(LHS) > 32 &&
2089       DAG.ComputeNumSignBits(RHS) > 32) {
2090     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2091 
2092     //HiLo split
2093     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2094     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2095     SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2096                                  LHS_Lo, RHS_Lo);
2097     SDValue Res[2] = {
2098       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2099       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2100     };
2101     return DAG.getMergeValues(Res, DL);
2102   }
2103 
2104   SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2105   SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2106   SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2107   SDValue RSign = LHSign; // Remainder sign is the same as LHS
2108 
2109   LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2110   RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2111 
2112   LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2113   RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2114 
2115   SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2116   SDValue Rem = Div.getValue(1);
2117 
2118   Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2119   Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2120 
2121   Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2122   Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2123 
2124   SDValue Res[2] = {
2125     Div,
2126     Rem
2127   };
2128   return DAG.getMergeValues(Res, DL);
2129 }
2130 
2131 // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2132 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2133   SDLoc SL(Op);
2134   EVT VT = Op.getValueType();
2135   auto Flags = Op->getFlags();
2136   SDValue X = Op.getOperand(0);
2137   SDValue Y = Op.getOperand(1);
2138 
2139   SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2140   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2141   SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2142   // TODO: For f32 use FMAD instead if !hasFastFMA32?
2143   return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2144 }
2145 
2146 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2147   SDLoc SL(Op);
2148   SDValue Src = Op.getOperand(0);
2149 
2150   // result = trunc(src)
2151   // if (src > 0.0 && src != result)
2152   //   result += 1.0
2153 
2154   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2155 
2156   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2157   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2158 
2159   EVT SetCCVT =
2160       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2161 
2162   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2163   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2164   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2165 
2166   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2167   // TODO: Should this propagate fast-math-flags?
2168   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2169 }
2170 
2171 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2172                                   SelectionDAG &DAG) {
2173   const unsigned FractBits = 52;
2174   const unsigned ExpBits = 11;
2175 
2176   SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2177                                 Hi,
2178                                 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2179                                 DAG.getConstant(ExpBits, SL, MVT::i32));
2180   SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2181                             DAG.getConstant(1023, SL, MVT::i32));
2182 
2183   return Exp;
2184 }
2185 
2186 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2187   SDLoc SL(Op);
2188   SDValue Src = Op.getOperand(0);
2189 
2190   assert(Op.getValueType() == MVT::f64);
2191 
2192   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2193 
2194   // Extract the upper half, since this is where we will find the sign and
2195   // exponent.
2196   SDValue Hi = getHiHalf64(Src, DAG);
2197 
2198   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2199 
2200   const unsigned FractBits = 52;
2201 
2202   // Extract the sign bit.
2203   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2204   SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2205 
2206   // Extend back to 64-bits.
2207   SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2208   SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2209 
2210   SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2211   const SDValue FractMask
2212     = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2213 
2214   SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2215   SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2216   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2217 
2218   EVT SetCCVT =
2219       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2220 
2221   const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2222 
2223   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2224   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2225 
2226   SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2227   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2228 
2229   return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2230 }
2231 
2232 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2233   SDLoc SL(Op);
2234   SDValue Src = Op.getOperand(0);
2235 
2236   assert(Op.getValueType() == MVT::f64);
2237 
2238   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2239   SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2240   SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2241 
2242   // TODO: Should this propagate fast-math-flags?
2243 
2244   SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2245   SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2246 
2247   SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2248 
2249   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2250   SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2251 
2252   EVT SetCCVT =
2253       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2254   SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2255 
2256   return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2257 }
2258 
2259 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2260   // FNEARBYINT and FRINT are the same, except in their handling of FP
2261   // exceptions. Those aren't really meaningful for us, and OpenCL only has
2262   // rint, so just treat them as equivalent.
2263   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2264 }
2265 
2266 SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2267                                               SelectionDAG &DAG) const {
2268   auto VT = Op.getValueType();
2269   auto Arg = Op.getOperand(0u);
2270   return DAG.getNode(ISD::FRINT, SDLoc(Op), VT, Arg);
2271 }
2272 
2273 // XXX - May require not supporting f32 denormals?
2274 
2275 // Don't handle v2f16. The extra instructions to scalarize and repack around the
2276 // compare and vselect end up producing worse code than scalarizing the whole
2277 // operation.
2278 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2279   SDLoc SL(Op);
2280   SDValue X = Op.getOperand(0);
2281   EVT VT = Op.getValueType();
2282 
2283   SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2284 
2285   // TODO: Should this propagate fast-math-flags?
2286 
2287   SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2288 
2289   SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2290 
2291   const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2292   const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2293   const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2294 
2295   SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2296 
2297   EVT SetCCVT =
2298       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2299 
2300   SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2301 
2302   SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2303 
2304   return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2305 }
2306 
2307 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2308   SDLoc SL(Op);
2309   SDValue Src = Op.getOperand(0);
2310 
2311   // result = trunc(src);
2312   // if (src < 0.0 && src != result)
2313   //   result += -1.0.
2314 
2315   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2316 
2317   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2318   const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2319 
2320   EVT SetCCVT =
2321       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2322 
2323   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2324   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2325   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2326 
2327   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2328   // TODO: Should this propagate fast-math-flags?
2329   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2330 }
2331 
2332 SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2333                                         double Log2BaseInverted) const {
2334   EVT VT = Op.getValueType();
2335 
2336   SDLoc SL(Op);
2337   SDValue Operand = Op.getOperand(0);
2338   SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2339   SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2340 
2341   return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2342 }
2343 
2344 // exp2(M_LOG2E_F * f);
2345 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2346   EVT VT = Op.getValueType();
2347   SDLoc SL(Op);
2348   SDValue Src = Op.getOperand(0);
2349 
2350   const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2351   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2352   return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2353 }
2354 
2355 static bool isCtlzOpc(unsigned Opc) {
2356   return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2357 }
2358 
2359 static bool isCttzOpc(unsigned Opc) {
2360   return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2361 }
2362 
2363 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2364   SDLoc SL(Op);
2365   SDValue Src = Op.getOperand(0);
2366 
2367   assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
2368   bool Ctlz = isCtlzOpc(Op.getOpcode());
2369   unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
2370 
2371   bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
2372                    Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
2373 
2374   if (Src.getValueType() == MVT::i32) {
2375     // (ctlz hi:lo) -> (umin (ffbh src), 32)
2376     // (cttz hi:lo) -> (umin (ffbl src), 32)
2377     // (ctlz_zero_undef src) -> (ffbh src)
2378     // (cttz_zero_undef src) -> (ffbl src)
2379     SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
2380     if (!ZeroUndef) {
2381       const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2382       NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
2383     }
2384     return NewOpr;
2385   }
2386 
2387   SDValue Lo, Hi;
2388   std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2389 
2390   SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
2391   SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
2392 
2393   // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
2394   // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
2395   // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2396   // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2397 
2398   unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
2399   const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2400   if (Ctlz)
2401     OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
2402   else
2403     OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
2404 
2405   SDValue NewOpr;
2406   NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
2407   if (!ZeroUndef) {
2408     const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
2409     NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
2410   }
2411 
2412   return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2413 }
2414 
2415 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2416                                                bool Signed) const {
2417   // The regular method converting a 64-bit integer to float roughly consists of
2418   // 2 steps: normalization and rounding. In fact, after normalization, the
2419   // conversion from a 64-bit integer to a float is essentially the same as the
2420   // one from a 32-bit integer. The only difference is that it has more
2421   // trailing bits to be rounded. To leverage the native 32-bit conversion, a
2422   // 64-bit integer could be preprocessed and fit into a 32-bit integer then
2423   // converted into the correct float number. The basic steps for the unsigned
2424   // conversion are illustrated in the following pseudo code:
2425   //
2426   // f32 uitofp(i64 u) {
2427   //   i32 hi, lo = split(u);
2428   //   // Only count the leading zeros in hi as we have native support of the
2429   //   // conversion from i32 to f32. If hi is all 0s, the conversion is
2430   //   // reduced to a 32-bit one automatically.
2431   //   i32 shamt = clz(hi); // Return 32 if hi is all 0s.
2432   //   u <<= shamt;
2433   //   hi, lo = split(u);
2434   //   hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
2435   //   // convert it as a 32-bit integer and scale the result back.
2436   //   return uitofp(hi) * 2^(32 - shamt);
2437   // }
2438   //
2439   // The signed one follows the same principle but uses 'ffbh_i32' to count its
2440   // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
2441   // converted instead followed by negation based its sign bit.
2442 
2443   SDLoc SL(Op);
2444   SDValue Src = Op.getOperand(0);
2445 
2446   SDValue Lo, Hi;
2447   std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2448   SDValue Sign;
2449   SDValue ShAmt;
2450   if (Signed && Subtarget->isGCN()) {
2451     // We also need to consider the sign bit in Lo if Hi has just sign bits,
2452     // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
2453     // account. That is, the maximal shift is
2454     // - 32 if Lo and Hi have opposite signs;
2455     // - 33 if Lo and Hi have the same sign.
2456     //
2457     // Or, MaxShAmt = 33 + OppositeSign, where
2458     //
2459     // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
2460     // - -1 if Lo and Hi have opposite signs; and
2461     // -  0 otherwise.
2462     //
2463     // All in all, ShAmt is calculated as
2464     //
2465     //  umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
2466     //
2467     // or
2468     //
2469     //  umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
2470     //
2471     // to reduce the critical path.
2472     SDValue OppositeSign = DAG.getNode(
2473         ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
2474         DAG.getConstant(31, SL, MVT::i32));
2475     SDValue MaxShAmt =
2476         DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2477                     OppositeSign);
2478     // Count the leading sign bits.
2479     ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
2480     // Different from unsigned conversion, the shift should be one bit less to
2481     // preserve the sign bit.
2482     ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
2483                         DAG.getConstant(1, SL, MVT::i32));
2484     ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
2485   } else {
2486     if (Signed) {
2487       // Without 'ffbh_i32', only leading zeros could be counted. Take the
2488       // absolute value first.
2489       Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
2490                          DAG.getConstant(63, SL, MVT::i64));
2491       SDValue Abs =
2492           DAG.getNode(ISD::XOR, SL, MVT::i64,
2493                       DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
2494       std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
2495     }
2496     // Count the leading zeros.
2497     ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
2498     // The shift amount for signed integers is [0, 32].
2499   }
2500   // Normalize the given 64-bit integer.
2501   SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
2502   // Split it again.
2503   std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
2504   // Calculate the adjust bit for rounding.
2505   // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
2506   SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
2507                                DAG.getConstant(1, SL, MVT::i32), Lo);
2508   // Get the 32-bit normalized integer.
2509   Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
2510   // Convert the normalized 32-bit integer into f32.
2511   unsigned Opc =
2512       (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2513   SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
2514 
2515   // Finally, need to scale back the converted floating number as the original
2516   // 64-bit integer is converted as a 32-bit one.
2517   ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2518                       ShAmt);
2519   // On GCN, use LDEXP directly.
2520   if (Subtarget->isGCN())
2521     return DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f32, FVal, ShAmt);
2522 
2523   // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
2524   // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
2525   // exponent is enough to avoid overflowing into the sign bit.
2526   SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
2527                             DAG.getConstant(23, SL, MVT::i32));
2528   SDValue IVal =
2529       DAG.getNode(ISD::ADD, SL, MVT::i32,
2530                   DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
2531   if (Signed) {
2532     // Set the sign bit.
2533     Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
2534                        DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
2535                        DAG.getConstant(31, SL, MVT::i32));
2536     IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
2537   }
2538   return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
2539 }
2540 
2541 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2542                                                bool Signed) const {
2543   SDLoc SL(Op);
2544   SDValue Src = Op.getOperand(0);
2545 
2546   SDValue Lo, Hi;
2547   std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2548 
2549   SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2550                               SL, MVT::f64, Hi);
2551 
2552   SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2553 
2554   SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2555                               DAG.getConstant(32, SL, MVT::i32));
2556   // TODO: Should this propagate fast-math-flags?
2557   return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2558 }
2559 
2560 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2561                                                SelectionDAG &DAG) const {
2562   // TODO: Factor out code common with LowerSINT_TO_FP.
2563   EVT DestVT = Op.getValueType();
2564   SDValue Src = Op.getOperand(0);
2565   EVT SrcVT = Src.getValueType();
2566 
2567   if (SrcVT == MVT::i16) {
2568     if (DestVT == MVT::f16)
2569       return Op;
2570     SDLoc DL(Op);
2571 
2572     // Promote src to i32
2573     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
2574     return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2575   }
2576 
2577   assert(SrcVT == MVT::i64 && "operation should be legal");
2578 
2579   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2580     SDLoc DL(Op);
2581 
2582     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2583     SDValue FPRoundFlag =
2584         DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
2585     SDValue FPRound =
2586         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2587 
2588     return FPRound;
2589   }
2590 
2591   if (DestVT == MVT::f32)
2592     return LowerINT_TO_FP32(Op, DAG, false);
2593 
2594   assert(DestVT == MVT::f64);
2595   return LowerINT_TO_FP64(Op, DAG, false);
2596 }
2597 
2598 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2599                                               SelectionDAG &DAG) const {
2600   EVT DestVT = Op.getValueType();
2601 
2602   SDValue Src = Op.getOperand(0);
2603   EVT SrcVT = Src.getValueType();
2604 
2605   if (SrcVT == MVT::i16) {
2606     if (DestVT == MVT::f16)
2607       return Op;
2608 
2609     SDLoc DL(Op);
2610     // Promote src to i32
2611     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2612     return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2613   }
2614 
2615   assert(SrcVT == MVT::i64 && "operation should be legal");
2616 
2617   // TODO: Factor out code common with LowerUINT_TO_FP.
2618 
2619   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2620     SDLoc DL(Op);
2621     SDValue Src = Op.getOperand(0);
2622 
2623     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2624     SDValue FPRoundFlag =
2625         DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
2626     SDValue FPRound =
2627         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2628 
2629     return FPRound;
2630   }
2631 
2632   if (DestVT == MVT::f32)
2633     return LowerINT_TO_FP32(Op, DAG, true);
2634 
2635   assert(DestVT == MVT::f64);
2636   return LowerINT_TO_FP64(Op, DAG, true);
2637 }
2638 
2639 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
2640                                                bool Signed) const {
2641   SDLoc SL(Op);
2642 
2643   SDValue Src = Op.getOperand(0);
2644   EVT SrcVT = Src.getValueType();
2645 
2646   assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
2647 
2648   // The basic idea of converting a floating point number into a pair of 32-bit
2649   // integers is illustrated as follows:
2650   //
2651   //     tf := trunc(val);
2652   //    hif := floor(tf * 2^-32);
2653   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2654   //     hi := fptoi(hif);
2655   //     lo := fptoi(lof);
2656   //
2657   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
2658   SDValue Sign;
2659   if (Signed && SrcVT == MVT::f32) {
2660     // However, a 32-bit floating point number has only 23 bits mantissa and
2661     // it's not enough to hold all the significant bits of `lof` if val is
2662     // negative. To avoid the loss of precision, We need to take the absolute
2663     // value after truncating and flip the result back based on the original
2664     // signedness.
2665     Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
2666                        DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
2667                        DAG.getConstant(31, SL, MVT::i32));
2668     Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
2669   }
2670 
2671   SDValue K0, K1;
2672   if (SrcVT == MVT::f64) {
2673     K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)),
2674                            SL, SrcVT);
2675     K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)),
2676                            SL, SrcVT);
2677   } else {
2678     K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)), SL,
2679                            SrcVT);
2680     K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)), SL,
2681                            SrcVT);
2682   }
2683   // TODO: Should this propagate fast-math-flags?
2684   SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
2685 
2686   SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
2687 
2688   SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
2689 
2690   SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
2691                                                          : ISD::FP_TO_UINT,
2692                            SL, MVT::i32, FloorMul);
2693   SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2694 
2695   SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2696                                DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
2697 
2698   if (Signed && SrcVT == MVT::f32) {
2699     assert(Sign);
2700     // Flip the result based on the signedness, which is either all 0s or 1s.
2701     Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2702                        DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
2703     // r := xor(r, sign) - sign;
2704     Result =
2705         DAG.getNode(ISD::SUB, SL, MVT::i64,
2706                     DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
2707   }
2708 
2709   return Result;
2710 }
2711 
2712 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2713   SDLoc DL(Op);
2714   SDValue N0 = Op.getOperand(0);
2715 
2716   // Convert to target node to get known bits
2717   if (N0.getValueType() == MVT::f32)
2718     return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2719 
2720   if (getTargetMachine().Options.UnsafeFPMath) {
2721     // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2722     return SDValue();
2723   }
2724 
2725   assert(N0.getSimpleValueType() == MVT::f64);
2726 
2727   // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2728   const unsigned ExpMask = 0x7ff;
2729   const unsigned ExpBiasf64 = 1023;
2730   const unsigned ExpBiasf16 = 15;
2731   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2732   SDValue One = DAG.getConstant(1, DL, MVT::i32);
2733   SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2734   SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2735                            DAG.getConstant(32, DL, MVT::i64));
2736   UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2737   U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2738   SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2739                           DAG.getConstant(20, DL, MVT::i64));
2740   E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2741                   DAG.getConstant(ExpMask, DL, MVT::i32));
2742   // Subtract the fp64 exponent bias (1023) to get the real exponent and
2743   // add the f16 bias (15) to get the biased exponent for the f16 format.
2744   E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2745                   DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2746 
2747   SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2748                           DAG.getConstant(8, DL, MVT::i32));
2749   M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2750                   DAG.getConstant(0xffe, DL, MVT::i32));
2751 
2752   SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2753                                   DAG.getConstant(0x1ff, DL, MVT::i32));
2754   MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2755 
2756   SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2757   M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2758 
2759   // (M != 0 ? 0x0200 : 0) | 0x7c00;
2760   SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2761       DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2762                       Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2763 
2764   // N = M | (E << 12);
2765   SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2766       DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2767                   DAG.getConstant(12, DL, MVT::i32)));
2768 
2769   // B = clamp(1-E, 0, 13);
2770   SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2771                                   One, E);
2772   SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2773   B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2774                   DAG.getConstant(13, DL, MVT::i32));
2775 
2776   SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2777                                    DAG.getConstant(0x1000, DL, MVT::i32));
2778 
2779   SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2780   SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2781   SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2782   D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2783 
2784   SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2785   SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2786                               DAG.getConstant(0x7, DL, MVT::i32));
2787   V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2788                   DAG.getConstant(2, DL, MVT::i32));
2789   SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2790                                One, Zero, ISD::SETEQ);
2791   SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2792                                One, Zero, ISD::SETGT);
2793   V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2794   V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2795 
2796   V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2797                       DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2798   V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2799                       I, V, ISD::SETEQ);
2800 
2801   // Extract the sign bit.
2802   SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2803                             DAG.getConstant(16, DL, MVT::i32));
2804   Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2805                      DAG.getConstant(0x8000, DL, MVT::i32));
2806 
2807   V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2808   return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2809 }
2810 
2811 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
2812                                              SelectionDAG &DAG) const {
2813   SDValue Src = Op.getOperand(0);
2814   unsigned OpOpcode = Op.getOpcode();
2815   EVT SrcVT = Src.getValueType();
2816   EVT DestVT = Op.getValueType();
2817 
2818   // Will be selected natively
2819   if (SrcVT == MVT::f16 && DestVT == MVT::i16)
2820     return Op;
2821 
2822   // Promote i16 to i32
2823   if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
2824     SDLoc DL(Op);
2825 
2826     SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2827     return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
2828   }
2829 
2830   if (SrcVT == MVT::f16 ||
2831       (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
2832     SDLoc DL(Op);
2833 
2834     SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2835     unsigned Ext =
2836         OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2837     return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
2838   }
2839 
2840   if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
2841     return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
2842 
2843   return SDValue();
2844 }
2845 
2846 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2847                                                      SelectionDAG &DAG) const {
2848   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2849   MVT VT = Op.getSimpleValueType();
2850   MVT ScalarVT = VT.getScalarType();
2851 
2852   assert(VT.isVector());
2853 
2854   SDValue Src = Op.getOperand(0);
2855   SDLoc DL(Op);
2856 
2857   // TODO: Don't scalarize on Evergreen?
2858   unsigned NElts = VT.getVectorNumElements();
2859   SmallVector<SDValue, 8> Args;
2860   DAG.ExtractVectorElements(Src, Args, 0, NElts);
2861 
2862   SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2863   for (unsigned I = 0; I < NElts; ++I)
2864     Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2865 
2866   return DAG.getBuildVector(VT, DL, Args);
2867 }
2868 
2869 //===----------------------------------------------------------------------===//
2870 // Custom DAG optimizations
2871 //===----------------------------------------------------------------------===//
2872 
2873 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2874   return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2875 }
2876 
2877 static bool isI24(SDValue Op, SelectionDAG &DAG) {
2878   EVT VT = Op.getValueType();
2879   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2880                                      // as unsigned 24-bit values.
2881          AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
2882 }
2883 
2884 static SDValue simplifyMul24(SDNode *Node24,
2885                              TargetLowering::DAGCombinerInfo &DCI) {
2886   SelectionDAG &DAG = DCI.DAG;
2887   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2888   bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2889 
2890   SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2891   SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2892   unsigned NewOpcode = Node24->getOpcode();
2893   if (IsIntrin) {
2894     unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2895     switch (IID) {
2896     case Intrinsic::amdgcn_mul_i24:
2897       NewOpcode = AMDGPUISD::MUL_I24;
2898       break;
2899     case Intrinsic::amdgcn_mul_u24:
2900       NewOpcode = AMDGPUISD::MUL_U24;
2901       break;
2902     case Intrinsic::amdgcn_mulhi_i24:
2903       NewOpcode = AMDGPUISD::MULHI_I24;
2904       break;
2905     case Intrinsic::amdgcn_mulhi_u24:
2906       NewOpcode = AMDGPUISD::MULHI_U24;
2907       break;
2908     default:
2909       llvm_unreachable("Expected 24-bit mul intrinsic");
2910     }
2911   }
2912 
2913   APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2914 
2915   // First try to simplify using SimplifyMultipleUseDemandedBits which allows
2916   // the operands to have other uses, but will only perform simplifications that
2917   // involve bypassing some nodes for this user.
2918   SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
2919   SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
2920   if (DemandedLHS || DemandedRHS)
2921     return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2922                        DemandedLHS ? DemandedLHS : LHS,
2923                        DemandedRHS ? DemandedRHS : RHS);
2924 
2925   // Now try SimplifyDemandedBits which can simplify the nodes used by our
2926   // operands if this node is the only user.
2927   if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2928     return SDValue(Node24, 0);
2929   if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2930     return SDValue(Node24, 0);
2931 
2932   return SDValue();
2933 }
2934 
2935 template <typename IntTy>
2936 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2937                                uint32_t Width, const SDLoc &DL) {
2938   if (Width + Offset < 32) {
2939     uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2940     IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2941     return DAG.getConstant(Result, DL, MVT::i32);
2942   }
2943 
2944   return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2945 }
2946 
2947 static bool hasVolatileUser(SDNode *Val) {
2948   for (SDNode *U : Val->uses()) {
2949     if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2950       if (M->isVolatile())
2951         return true;
2952     }
2953   }
2954 
2955   return false;
2956 }
2957 
2958 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2959   // i32 vectors are the canonical memory type.
2960   if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2961     return false;
2962 
2963   if (!VT.isByteSized())
2964     return false;
2965 
2966   unsigned Size = VT.getStoreSize();
2967 
2968   if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2969     return false;
2970 
2971   if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2972     return false;
2973 
2974   return true;
2975 }
2976 
2977 // Replace load of an illegal type with a store of a bitcast to a friendlier
2978 // type.
2979 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2980                                                  DAGCombinerInfo &DCI) const {
2981   if (!DCI.isBeforeLegalize())
2982     return SDValue();
2983 
2984   LoadSDNode *LN = cast<LoadSDNode>(N);
2985   if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2986     return SDValue();
2987 
2988   SDLoc SL(N);
2989   SelectionDAG &DAG = DCI.DAG;
2990   EVT VT = LN->getMemoryVT();
2991 
2992   unsigned Size = VT.getStoreSize();
2993   Align Alignment = LN->getAlign();
2994   if (Alignment < Size && isTypeLegal(VT)) {
2995     unsigned IsFast;
2996     unsigned AS = LN->getAddressSpace();
2997 
2998     // Expand unaligned loads earlier than legalization. Due to visitation order
2999     // problems during legalization, the emitted instructions to pack and unpack
3000     // the bytes again are not eliminated in the case of an unaligned copy.
3001     if (!allowsMisalignedMemoryAccesses(
3002             VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3003       if (VT.isVector())
3004         return SplitVectorLoad(SDValue(LN, 0), DAG);
3005 
3006       SDValue Ops[2];
3007       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3008 
3009       return DAG.getMergeValues(Ops, SDLoc(N));
3010     }
3011 
3012     if (!IsFast)
3013       return SDValue();
3014   }
3015 
3016   if (!shouldCombineMemoryType(VT))
3017     return SDValue();
3018 
3019   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3020 
3021   SDValue NewLoad
3022     = DAG.getLoad(NewVT, SL, LN->getChain(),
3023                   LN->getBasePtr(), LN->getMemOperand());
3024 
3025   SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3026   DCI.CombineTo(N, BC, NewLoad.getValue(1));
3027   return SDValue(N, 0);
3028 }
3029 
3030 // Replace store of an illegal type with a store of a bitcast to a friendlier
3031 // type.
3032 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3033                                                   DAGCombinerInfo &DCI) const {
3034   if (!DCI.isBeforeLegalize())
3035     return SDValue();
3036 
3037   StoreSDNode *SN = cast<StoreSDNode>(N);
3038   if (!SN->isSimple() || !ISD::isNormalStore(SN))
3039     return SDValue();
3040 
3041   EVT VT = SN->getMemoryVT();
3042   unsigned Size = VT.getStoreSize();
3043 
3044   SDLoc SL(N);
3045   SelectionDAG &DAG = DCI.DAG;
3046   Align Alignment = SN->getAlign();
3047   if (Alignment < Size && isTypeLegal(VT)) {
3048     unsigned IsFast;
3049     unsigned AS = SN->getAddressSpace();
3050 
3051     // Expand unaligned stores earlier than legalization. Due to visitation
3052     // order problems during legalization, the emitted instructions to pack and
3053     // unpack the bytes again are not eliminated in the case of an unaligned
3054     // copy.
3055     if (!allowsMisalignedMemoryAccesses(
3056             VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3057       if (VT.isVector())
3058         return SplitVectorStore(SDValue(SN, 0), DAG);
3059 
3060       return expandUnalignedStore(SN, DAG);
3061     }
3062 
3063     if (!IsFast)
3064       return SDValue();
3065   }
3066 
3067   if (!shouldCombineMemoryType(VT))
3068     return SDValue();
3069 
3070   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3071   SDValue Val = SN->getValue();
3072 
3073   //DCI.AddToWorklist(Val.getNode());
3074 
3075   bool OtherUses = !Val.hasOneUse();
3076   SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3077   if (OtherUses) {
3078     SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3079     DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3080   }
3081 
3082   return DAG.getStore(SN->getChain(), SL, CastVal,
3083                       SN->getBasePtr(), SN->getMemOperand());
3084 }
3085 
3086 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3087 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3088 // issues.
3089 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3090                                                         DAGCombinerInfo &DCI) const {
3091   SelectionDAG &DAG = DCI.DAG;
3092   SDValue N0 = N->getOperand(0);
3093 
3094   // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3095   //     (vt2 (truncate (assertzext vt0:x, vt1)))
3096   if (N0.getOpcode() == ISD::TRUNCATE) {
3097     SDValue N1 = N->getOperand(1);
3098     EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3099     SDLoc SL(N);
3100 
3101     SDValue Src = N0.getOperand(0);
3102     EVT SrcVT = Src.getValueType();
3103     if (SrcVT.bitsGE(ExtVT)) {
3104       SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3105       return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3106     }
3107   }
3108 
3109   return SDValue();
3110 }
3111 
3112 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3113   SDNode *N, DAGCombinerInfo &DCI) const {
3114   unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3115   switch (IID) {
3116   case Intrinsic::amdgcn_mul_i24:
3117   case Intrinsic::amdgcn_mul_u24:
3118   case Intrinsic::amdgcn_mulhi_i24:
3119   case Intrinsic::amdgcn_mulhi_u24:
3120     return simplifyMul24(N, DCI);
3121   case Intrinsic::amdgcn_fract:
3122   case Intrinsic::amdgcn_rsq:
3123   case Intrinsic::amdgcn_rcp_legacy:
3124   case Intrinsic::amdgcn_rsq_legacy:
3125   case Intrinsic::amdgcn_rsq_clamp:
3126   case Intrinsic::amdgcn_ldexp: {
3127     // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3128     SDValue Src = N->getOperand(1);
3129     return Src.isUndef() ? Src : SDValue();
3130   }
3131   default:
3132     return SDValue();
3133   }
3134 }
3135 
3136 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3137 /// binary operation \p Opc to it with the corresponding constant operands.
3138 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3139   DAGCombinerInfo &DCI, const SDLoc &SL,
3140   unsigned Opc, SDValue LHS,
3141   uint32_t ValLo, uint32_t ValHi) const {
3142   SelectionDAG &DAG = DCI.DAG;
3143   SDValue Lo, Hi;
3144   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3145 
3146   SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3147   SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3148 
3149   SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3150   SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3151 
3152   // Re-visit the ands. It's possible we eliminated one of them and it could
3153   // simplify the vector.
3154   DCI.AddToWorklist(Lo.getNode());
3155   DCI.AddToWorklist(Hi.getNode());
3156 
3157   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3158   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3159 }
3160 
3161 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3162                                                 DAGCombinerInfo &DCI) const {
3163   EVT VT = N->getValueType(0);
3164 
3165   ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3166   if (!RHS)
3167     return SDValue();
3168 
3169   SDValue LHS = N->getOperand(0);
3170   unsigned RHSVal = RHS->getZExtValue();
3171   if (!RHSVal)
3172     return LHS;
3173 
3174   SDLoc SL(N);
3175   SelectionDAG &DAG = DCI.DAG;
3176 
3177   switch (LHS->getOpcode()) {
3178   default:
3179     break;
3180   case ISD::ZERO_EXTEND:
3181   case ISD::SIGN_EXTEND:
3182   case ISD::ANY_EXTEND: {
3183     SDValue X = LHS->getOperand(0);
3184 
3185     if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3186         isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3187       // Prefer build_vector as the canonical form if packed types are legal.
3188       // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
3189       SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3190        { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3191       return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3192     }
3193 
3194     // shl (ext x) => zext (shl x), if shift does not overflow int
3195     if (VT != MVT::i64)
3196       break;
3197     KnownBits Known = DAG.computeKnownBits(X);
3198     unsigned LZ = Known.countMinLeadingZeros();
3199     if (LZ < RHSVal)
3200       break;
3201     EVT XVT = X.getValueType();
3202     SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3203     return DAG.getZExtOrTrunc(Shl, SL, VT);
3204   }
3205   }
3206 
3207   if (VT != MVT::i64)
3208     return SDValue();
3209 
3210   // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3211 
3212   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3213   // common case, splitting this into a move and a 32-bit shift is faster and
3214   // the same code size.
3215   if (RHSVal < 32)
3216     return SDValue();
3217 
3218   SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3219 
3220   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3221   SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3222 
3223   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3224 
3225   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3226   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3227 }
3228 
3229 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3230                                                 DAGCombinerInfo &DCI) const {
3231   if (N->getValueType(0) != MVT::i64)
3232     return SDValue();
3233 
3234   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3235   if (!RHS)
3236     return SDValue();
3237 
3238   SelectionDAG &DAG = DCI.DAG;
3239   SDLoc SL(N);
3240   unsigned RHSVal = RHS->getZExtValue();
3241 
3242   // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3243   if (RHSVal == 32) {
3244     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3245     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3246                                    DAG.getConstant(31, SL, MVT::i32));
3247 
3248     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3249     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3250   }
3251 
3252   // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3253   if (RHSVal == 63) {
3254     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3255     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3256                                    DAG.getConstant(31, SL, MVT::i32));
3257     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3258     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3259   }
3260 
3261   return SDValue();
3262 }
3263 
3264 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3265                                                 DAGCombinerInfo &DCI) const {
3266   auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3267   if (!RHS)
3268     return SDValue();
3269 
3270   EVT VT = N->getValueType(0);
3271   SDValue LHS = N->getOperand(0);
3272   unsigned ShiftAmt = RHS->getZExtValue();
3273   SelectionDAG &DAG = DCI.DAG;
3274   SDLoc SL(N);
3275 
3276   // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
3277   // this improves the ability to match BFE patterns in isel.
3278   if (LHS.getOpcode() == ISD::AND) {
3279     if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3280       unsigned MaskIdx, MaskLen;
3281       if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
3282           MaskIdx == ShiftAmt) {
3283         return DAG.getNode(
3284             ISD::AND, SL, VT,
3285             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3286             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3287       }
3288     }
3289   }
3290 
3291   if (VT != MVT::i64)
3292     return SDValue();
3293 
3294   if (ShiftAmt < 32)
3295     return SDValue();
3296 
3297   // srl i64:x, C for C >= 32
3298   // =>
3299   //   build_pair (srl hi_32(x), C - 32), 0
3300   SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3301 
3302   SDValue Hi = getHiHalf64(LHS, DAG);
3303 
3304   SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3305   SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3306 
3307   SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3308 
3309   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3310 }
3311 
3312 SDValue AMDGPUTargetLowering::performTruncateCombine(
3313   SDNode *N, DAGCombinerInfo &DCI) const {
3314   SDLoc SL(N);
3315   SelectionDAG &DAG = DCI.DAG;
3316   EVT VT = N->getValueType(0);
3317   SDValue Src = N->getOperand(0);
3318 
3319   // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3320   if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3321     SDValue Vec = Src.getOperand(0);
3322     if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3323       SDValue Elt0 = Vec.getOperand(0);
3324       EVT EltVT = Elt0.getValueType();
3325       if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
3326         if (EltVT.isFloatingPoint()) {
3327           Elt0 = DAG.getNode(ISD::BITCAST, SL,
3328                              EltVT.changeTypeToInteger(), Elt0);
3329         }
3330 
3331         return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3332       }
3333     }
3334   }
3335 
3336   // Equivalent of above for accessing the high element of a vector as an
3337   // integer operation.
3338   // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3339   if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3340     if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3341       if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3342         SDValue BV = stripBitcast(Src.getOperand(0));
3343         if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3344             BV.getValueType().getVectorNumElements() == 2) {
3345           SDValue SrcElt = BV.getOperand(1);
3346           EVT SrcEltVT = SrcElt.getValueType();
3347           if (SrcEltVT.isFloatingPoint()) {
3348             SrcElt = DAG.getNode(ISD::BITCAST, SL,
3349                                  SrcEltVT.changeTypeToInteger(), SrcElt);
3350           }
3351 
3352           return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3353         }
3354       }
3355     }
3356   }
3357 
3358   // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3359   //
3360   // i16 (trunc (srl i64:x, K)), K <= 16 ->
3361   //     i16 (trunc (srl (i32 (trunc x), K)))
3362   if (VT.getScalarSizeInBits() < 32) {
3363     EVT SrcVT = Src.getValueType();
3364     if (SrcVT.getScalarSizeInBits() > 32 &&
3365         (Src.getOpcode() == ISD::SRL ||
3366          Src.getOpcode() == ISD::SRA ||
3367          Src.getOpcode() == ISD::SHL)) {
3368       SDValue Amt = Src.getOperand(1);
3369       KnownBits Known = DAG.computeKnownBits(Amt);
3370 
3371       // - For left shifts, do the transform as long as the shift
3372       //   amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
3373       // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
3374       //   losing information stored in the high bits when truncating.
3375       const unsigned MaxCstSize =
3376           (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
3377       if (Known.getMaxValue().ule(MaxCstSize)) {
3378         EVT MidVT = VT.isVector() ?
3379           EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3380                            VT.getVectorNumElements()) : MVT::i32;
3381 
3382         EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3383         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3384                                     Src.getOperand(0));
3385         DCI.AddToWorklist(Trunc.getNode());
3386 
3387         if (Amt.getValueType() != NewShiftVT) {
3388           Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3389           DCI.AddToWorklist(Amt.getNode());
3390         }
3391 
3392         SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3393                                           Trunc, Amt);
3394         return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3395       }
3396     }
3397   }
3398 
3399   return SDValue();
3400 }
3401 
3402 // We need to specifically handle i64 mul here to avoid unnecessary conversion
3403 // instructions. If we only match on the legalized i64 mul expansion,
3404 // SimplifyDemandedBits will be unable to remove them because there will be
3405 // multiple uses due to the separate mul + mulh[su].
3406 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3407                         SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3408   if (Size <= 32) {
3409     unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3410     return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3411   }
3412 
3413   unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3414   unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3415 
3416   SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3417   SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3418 
3419   return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
3420 }
3421 
3422 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3423                                                 DAGCombinerInfo &DCI) const {
3424   EVT VT = N->getValueType(0);
3425 
3426   // Don't generate 24-bit multiplies on values that are in SGPRs, since
3427   // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3428   // unnecessarily). isDivergent() is used as an approximation of whether the
3429   // value is in an SGPR.
3430   if (!N->isDivergent())
3431     return SDValue();
3432 
3433   unsigned Size = VT.getSizeInBits();
3434   if (VT.isVector() || Size > 64)
3435     return SDValue();
3436 
3437   // There are i16 integer mul/mad.
3438   if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3439     return SDValue();
3440 
3441   SelectionDAG &DAG = DCI.DAG;
3442   SDLoc DL(N);
3443 
3444   SDValue N0 = N->getOperand(0);
3445   SDValue N1 = N->getOperand(1);
3446 
3447   // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3448   // in the source into any_extends if the result of the mul is truncated. Since
3449   // we can assume the high bits are whatever we want, use the underlying value
3450   // to avoid the unknown high bits from interfering.
3451   if (N0.getOpcode() == ISD::ANY_EXTEND)
3452     N0 = N0.getOperand(0);
3453 
3454   if (N1.getOpcode() == ISD::ANY_EXTEND)
3455     N1 = N1.getOperand(0);
3456 
3457   SDValue Mul;
3458 
3459   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3460     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3461     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3462     Mul = getMul24(DAG, DL, N0, N1, Size, false);
3463   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3464     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3465     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3466     Mul = getMul24(DAG, DL, N0, N1, Size, true);
3467   } else {
3468     return SDValue();
3469   }
3470 
3471   // We need to use sext even for MUL_U24, because MUL_U24 is used
3472   // for signed multiply of 8 and 16-bit types.
3473   return DAG.getSExtOrTrunc(Mul, DL, VT);
3474 }
3475 
3476 SDValue
3477 AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
3478                                             DAGCombinerInfo &DCI) const {
3479   if (N->getValueType(0) != MVT::i32)
3480     return SDValue();
3481 
3482   SelectionDAG &DAG = DCI.DAG;
3483   SDLoc DL(N);
3484 
3485   SDValue N0 = N->getOperand(0);
3486   SDValue N1 = N->getOperand(1);
3487 
3488   // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3489   // in the source into any_extends if the result of the mul is truncated. Since
3490   // we can assume the high bits are whatever we want, use the underlying value
3491   // to avoid the unknown high bits from interfering.
3492   if (N0.getOpcode() == ISD::ANY_EXTEND)
3493     N0 = N0.getOperand(0);
3494   if (N1.getOpcode() == ISD::ANY_EXTEND)
3495     N1 = N1.getOperand(0);
3496 
3497   // Try to use two fast 24-bit multiplies (one for each half of the result)
3498   // instead of one slow extending multiply.
3499   unsigned LoOpcode, HiOpcode;
3500   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3501     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3502     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3503     LoOpcode = AMDGPUISD::MUL_U24;
3504     HiOpcode = AMDGPUISD::MULHI_U24;
3505   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3506     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3507     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3508     LoOpcode = AMDGPUISD::MUL_I24;
3509     HiOpcode = AMDGPUISD::MULHI_I24;
3510   } else {
3511     return SDValue();
3512   }
3513 
3514   SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
3515   SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
3516   DCI.CombineTo(N, Lo, Hi);
3517   return SDValue(N, 0);
3518 }
3519 
3520 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3521                                                   DAGCombinerInfo &DCI) const {
3522   EVT VT = N->getValueType(0);
3523 
3524   if (!Subtarget->hasMulI24() || VT.isVector())
3525     return SDValue();
3526 
3527   // Don't generate 24-bit multiplies on values that are in SGPRs, since
3528   // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3529   // unnecessarily). isDivergent() is used as an approximation of whether the
3530   // value is in an SGPR.
3531   // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3532   // valu op anyway)
3533   if (Subtarget->hasSMulHi() && !N->isDivergent())
3534     return SDValue();
3535 
3536   SelectionDAG &DAG = DCI.DAG;
3537   SDLoc DL(N);
3538 
3539   SDValue N0 = N->getOperand(0);
3540   SDValue N1 = N->getOperand(1);
3541 
3542   if (!isI24(N0, DAG) || !isI24(N1, DAG))
3543     return SDValue();
3544 
3545   N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3546   N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3547 
3548   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3549   DCI.AddToWorklist(Mulhi.getNode());
3550   return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3551 }
3552 
3553 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3554                                                   DAGCombinerInfo &DCI) const {
3555   EVT VT = N->getValueType(0);
3556 
3557   if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3558     return SDValue();
3559 
3560   // Don't generate 24-bit multiplies on values that are in SGPRs, since
3561   // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3562   // unnecessarily). isDivergent() is used as an approximation of whether the
3563   // value is in an SGPR.
3564   // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3565   // valu op anyway)
3566   if (Subtarget->hasSMulHi() && !N->isDivergent())
3567     return SDValue();
3568 
3569   SelectionDAG &DAG = DCI.DAG;
3570   SDLoc DL(N);
3571 
3572   SDValue N0 = N->getOperand(0);
3573   SDValue N1 = N->getOperand(1);
3574 
3575   if (!isU24(N0, DAG) || !isU24(N1, DAG))
3576     return SDValue();
3577 
3578   N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3579   N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3580 
3581   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3582   DCI.AddToWorklist(Mulhi.getNode());
3583   return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3584 }
3585 
3586 static bool isNegativeOne(SDValue Val) {
3587   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3588     return C->isAllOnes();
3589   return false;
3590 }
3591 
3592 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3593                                           SDValue Op,
3594                                           const SDLoc &DL,
3595                                           unsigned Opc) const {
3596   EVT VT = Op.getValueType();
3597   EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3598   if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3599                               LegalVT != MVT::i16))
3600     return SDValue();
3601 
3602   if (VT != MVT::i32)
3603     Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3604 
3605   SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3606   if (VT != MVT::i32)
3607     FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3608 
3609   return FFBX;
3610 }
3611 
3612 // The native instructions return -1 on 0 input. Optimize out a select that
3613 // produces -1 on 0.
3614 //
3615 // TODO: If zero is not undef, we could also do this if the output is compared
3616 // against the bitwidth.
3617 //
3618 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3619 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3620                                                  SDValue LHS, SDValue RHS,
3621                                                  DAGCombinerInfo &DCI) const {
3622   ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3623   if (!CmpRhs || !CmpRhs->isZero())
3624     return SDValue();
3625 
3626   SelectionDAG &DAG = DCI.DAG;
3627   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3628   SDValue CmpLHS = Cond.getOperand(0);
3629 
3630   // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3631   // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3632   if (CCOpcode == ISD::SETEQ &&
3633       (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3634       RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
3635     unsigned Opc =
3636         isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3637     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3638   }
3639 
3640   // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3641   // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3642   if (CCOpcode == ISD::SETNE &&
3643       (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
3644       LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
3645     unsigned Opc =
3646         isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3647 
3648     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3649   }
3650 
3651   return SDValue();
3652 }
3653 
3654 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3655                                          unsigned Op,
3656                                          const SDLoc &SL,
3657                                          SDValue Cond,
3658                                          SDValue N1,
3659                                          SDValue N2) {
3660   SelectionDAG &DAG = DCI.DAG;
3661   EVT VT = N1.getValueType();
3662 
3663   SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3664                                   N1.getOperand(0), N2.getOperand(0));
3665   DCI.AddToWorklist(NewSelect.getNode());
3666   return DAG.getNode(Op, SL, VT, NewSelect);
3667 }
3668 
3669 // Pull a free FP operation out of a select so it may fold into uses.
3670 //
3671 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3672 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
3673 //
3674 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3675 // select c, (fabs x), +k -> fabs (select c, x, k)
3676 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3677                                     SDValue N) {
3678   SelectionDAG &DAG = DCI.DAG;
3679   SDValue Cond = N.getOperand(0);
3680   SDValue LHS = N.getOperand(1);
3681   SDValue RHS = N.getOperand(2);
3682 
3683   EVT VT = N.getValueType();
3684   if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3685       (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3686     return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3687                                      SDLoc(N), Cond, LHS, RHS);
3688   }
3689 
3690   bool Inv = false;
3691   if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3692     std::swap(LHS, RHS);
3693     Inv = true;
3694   }
3695 
3696   // TODO: Support vector constants.
3697   ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3698   if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3699     SDLoc SL(N);
3700     // If one side is an fneg/fabs and the other is a constant, we can push the
3701     // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3702     SDValue NewLHS = LHS.getOperand(0);
3703     SDValue NewRHS = RHS;
3704 
3705     // Careful: if the neg can be folded up, don't try to pull it back down.
3706     bool ShouldFoldNeg = true;
3707 
3708     if (NewLHS.hasOneUse()) {
3709       unsigned Opc = NewLHS.getOpcode();
3710       if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3711         ShouldFoldNeg = false;
3712       if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3713         ShouldFoldNeg = false;
3714     }
3715 
3716     if (ShouldFoldNeg) {
3717       if (LHS.getOpcode() == ISD::FNEG)
3718         NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3719       else if (CRHS->isNegative())
3720         return SDValue();
3721 
3722       if (Inv)
3723         std::swap(NewLHS, NewRHS);
3724 
3725       SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3726                                       Cond, NewLHS, NewRHS);
3727       DCI.AddToWorklist(NewSelect.getNode());
3728       return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3729     }
3730   }
3731 
3732   return SDValue();
3733 }
3734 
3735 
3736 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3737                                                    DAGCombinerInfo &DCI) const {
3738   if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3739     return Folded;
3740 
3741   SDValue Cond = N->getOperand(0);
3742   if (Cond.getOpcode() != ISD::SETCC)
3743     return SDValue();
3744 
3745   EVT VT = N->getValueType(0);
3746   SDValue LHS = Cond.getOperand(0);
3747   SDValue RHS = Cond.getOperand(1);
3748   SDValue CC = Cond.getOperand(2);
3749 
3750   SDValue True = N->getOperand(1);
3751   SDValue False = N->getOperand(2);
3752 
3753   if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3754     SelectionDAG &DAG = DCI.DAG;
3755     if (DAG.isConstantValueOfAnyType(True) &&
3756         !DAG.isConstantValueOfAnyType(False)) {
3757       // Swap cmp + select pair to move constant to false input.
3758       // This will allow using VOPC cndmasks more often.
3759       // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3760 
3761       SDLoc SL(N);
3762       ISD::CondCode NewCC =
3763           getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
3764 
3765       SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3766       return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3767     }
3768 
3769     if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3770       SDValue MinMax
3771         = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3772       // Revisit this node so we can catch min3/max3/med3 patterns.
3773       //DCI.AddToWorklist(MinMax.getNode());
3774       return MinMax;
3775     }
3776   }
3777 
3778   // There's no reason to not do this if the condition has other uses.
3779   return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3780 }
3781 
3782 static bool isInv2Pi(const APFloat &APF) {
3783   static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3784   static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3785   static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3786 
3787   return APF.bitwiseIsEqual(KF16) ||
3788          APF.bitwiseIsEqual(KF32) ||
3789          APF.bitwiseIsEqual(KF64);
3790 }
3791 
3792 // 0 and 1.0 / (0.5 * pi) do not have inline immmediates, so there is an
3793 // additional cost to negate them.
3794 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
3795   if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
3796     if (C->isZero() && !C->isNegative())
3797       return true;
3798 
3799     if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3800       return true;
3801   }
3802 
3803   return false;
3804 }
3805 
3806 static unsigned inverseMinMax(unsigned Opc) {
3807   switch (Opc) {
3808   case ISD::FMAXNUM:
3809     return ISD::FMINNUM;
3810   case ISD::FMINNUM:
3811     return ISD::FMAXNUM;
3812   case ISD::FMAXNUM_IEEE:
3813     return ISD::FMINNUM_IEEE;
3814   case ISD::FMINNUM_IEEE:
3815     return ISD::FMAXNUM_IEEE;
3816   case AMDGPUISD::FMAX_LEGACY:
3817     return AMDGPUISD::FMIN_LEGACY;
3818   case AMDGPUISD::FMIN_LEGACY:
3819     return  AMDGPUISD::FMAX_LEGACY;
3820   default:
3821     llvm_unreachable("invalid min/max opcode");
3822   }
3823 }
3824 
3825 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3826                                                  DAGCombinerInfo &DCI) const {
3827   SelectionDAG &DAG = DCI.DAG;
3828   SDValue N0 = N->getOperand(0);
3829   EVT VT = N->getValueType(0);
3830 
3831   unsigned Opc = N0.getOpcode();
3832 
3833   // If the input has multiple uses and we can either fold the negate down, or
3834   // the other uses cannot, give up. This both prevents unprofitable
3835   // transformations and infinite loops: we won't repeatedly try to fold around
3836   // a negate that has no 'good' form.
3837   if (N0.hasOneUse()) {
3838     // This may be able to fold into the source, but at a code size cost. Don't
3839     // fold if the fold into the user is free.
3840     if (allUsesHaveSourceMods(N, 0))
3841       return SDValue();
3842   } else {
3843     if (fnegFoldsIntoOp(Opc) &&
3844         (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3845       return SDValue();
3846   }
3847 
3848   SDLoc SL(N);
3849   switch (Opc) {
3850   case ISD::FADD: {
3851     if (!mayIgnoreSignedZero(N0))
3852       return SDValue();
3853 
3854     // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3855     SDValue LHS = N0.getOperand(0);
3856     SDValue RHS = N0.getOperand(1);
3857 
3858     if (LHS.getOpcode() != ISD::FNEG)
3859       LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3860     else
3861       LHS = LHS.getOperand(0);
3862 
3863     if (RHS.getOpcode() != ISD::FNEG)
3864       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3865     else
3866       RHS = RHS.getOperand(0);
3867 
3868     SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3869     if (Res.getOpcode() != ISD::FADD)
3870       return SDValue(); // Op got folded away.
3871     if (!N0.hasOneUse())
3872       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3873     return Res;
3874   }
3875   case ISD::FMUL:
3876   case AMDGPUISD::FMUL_LEGACY: {
3877     // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3878     // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3879     SDValue LHS = N0.getOperand(0);
3880     SDValue RHS = N0.getOperand(1);
3881 
3882     if (LHS.getOpcode() == ISD::FNEG)
3883       LHS = LHS.getOperand(0);
3884     else if (RHS.getOpcode() == ISD::FNEG)
3885       RHS = RHS.getOperand(0);
3886     else
3887       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3888 
3889     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3890     if (Res.getOpcode() != Opc)
3891       return SDValue(); // Op got folded away.
3892     if (!N0.hasOneUse())
3893       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3894     return Res;
3895   }
3896   case ISD::FMA:
3897   case ISD::FMAD: {
3898     // TODO: handle llvm.amdgcn.fma.legacy
3899     if (!mayIgnoreSignedZero(N0))
3900       return SDValue();
3901 
3902     // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3903     SDValue LHS = N0.getOperand(0);
3904     SDValue MHS = N0.getOperand(1);
3905     SDValue RHS = N0.getOperand(2);
3906 
3907     if (LHS.getOpcode() == ISD::FNEG)
3908       LHS = LHS.getOperand(0);
3909     else if (MHS.getOpcode() == ISD::FNEG)
3910       MHS = MHS.getOperand(0);
3911     else
3912       MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3913 
3914     if (RHS.getOpcode() != ISD::FNEG)
3915       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3916     else
3917       RHS = RHS.getOperand(0);
3918 
3919     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3920     if (Res.getOpcode() != Opc)
3921       return SDValue(); // Op got folded away.
3922     if (!N0.hasOneUse())
3923       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3924     return Res;
3925   }
3926   case ISD::FMAXNUM:
3927   case ISD::FMINNUM:
3928   case ISD::FMAXNUM_IEEE:
3929   case ISD::FMINNUM_IEEE:
3930   case AMDGPUISD::FMAX_LEGACY:
3931   case AMDGPUISD::FMIN_LEGACY: {
3932     // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3933     // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3934     // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3935     // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3936 
3937     SDValue LHS = N0.getOperand(0);
3938     SDValue RHS = N0.getOperand(1);
3939 
3940     // 0 doesn't have a negated inline immediate.
3941     // TODO: This constant check should be generalized to other operations.
3942     if (isConstantCostlierToNegate(RHS))
3943       return SDValue();
3944 
3945     SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3946     SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3947     unsigned Opposite = inverseMinMax(Opc);
3948 
3949     SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3950     if (Res.getOpcode() != Opposite)
3951       return SDValue(); // Op got folded away.
3952     if (!N0.hasOneUse())
3953       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3954     return Res;
3955   }
3956   case AMDGPUISD::FMED3: {
3957     SDValue Ops[3];
3958     for (unsigned I = 0; I < 3; ++I)
3959       Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
3960 
3961     SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
3962     if (Res.getOpcode() != AMDGPUISD::FMED3)
3963       return SDValue(); // Op got folded away.
3964 
3965     if (!N0.hasOneUse()) {
3966       SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
3967       DAG.ReplaceAllUsesWith(N0, Neg);
3968 
3969       for (SDNode *U : Neg->uses())
3970         DCI.AddToWorklist(U);
3971     }
3972 
3973     return Res;
3974   }
3975   case ISD::FP_EXTEND:
3976   case ISD::FTRUNC:
3977   case ISD::FRINT:
3978   case ISD::FNEARBYINT: // XXX - Should fround be handled?
3979   case ISD::FSIN:
3980   case ISD::FCANONICALIZE:
3981   case AMDGPUISD::RCP:
3982   case AMDGPUISD::RCP_LEGACY:
3983   case AMDGPUISD::RCP_IFLAG:
3984   case AMDGPUISD::SIN_HW: {
3985     SDValue CvtSrc = N0.getOperand(0);
3986     if (CvtSrc.getOpcode() == ISD::FNEG) {
3987       // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3988       // (fneg (rcp (fneg x))) -> (rcp x)
3989       return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3990     }
3991 
3992     if (!N0.hasOneUse())
3993       return SDValue();
3994 
3995     // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3996     // (fneg (rcp x)) -> (rcp (fneg x))
3997     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3998     return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3999   }
4000   case ISD::FP_ROUND: {
4001     SDValue CvtSrc = N0.getOperand(0);
4002 
4003     if (CvtSrc.getOpcode() == ISD::FNEG) {
4004       // (fneg (fp_round (fneg x))) -> (fp_round x)
4005       return DAG.getNode(ISD::FP_ROUND, SL, VT,
4006                          CvtSrc.getOperand(0), N0.getOperand(1));
4007     }
4008 
4009     if (!N0.hasOneUse())
4010       return SDValue();
4011 
4012     // (fneg (fp_round x)) -> (fp_round (fneg x))
4013     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4014     return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4015   }
4016   case ISD::FP16_TO_FP: {
4017     // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4018     // f16, but legalization of f16 fneg ends up pulling it out of the source.
4019     // Put the fneg back as a legal source operation that can be matched later.
4020     SDLoc SL(N);
4021 
4022     SDValue Src = N0.getOperand(0);
4023     EVT SrcVT = Src.getValueType();
4024 
4025     // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4026     SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4027                                   DAG.getConstant(0x8000, SL, SrcVT));
4028     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4029   }
4030   default:
4031     return SDValue();
4032   }
4033 }
4034 
4035 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
4036                                                  DAGCombinerInfo &DCI) const {
4037   SelectionDAG &DAG = DCI.DAG;
4038   SDValue N0 = N->getOperand(0);
4039 
4040   if (!N0.hasOneUse())
4041     return SDValue();
4042 
4043   switch (N0.getOpcode()) {
4044   case ISD::FP16_TO_FP: {
4045     assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
4046     SDLoc SL(N);
4047     SDValue Src = N0.getOperand(0);
4048     EVT SrcVT = Src.getValueType();
4049 
4050     // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
4051     SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
4052                                   DAG.getConstant(0x7fff, SL, SrcVT));
4053     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
4054   }
4055   default:
4056     return SDValue();
4057   }
4058 }
4059 
4060 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
4061                                                 DAGCombinerInfo &DCI) const {
4062   const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
4063   if (!CFP)
4064     return SDValue();
4065 
4066   // XXX - Should this flush denormals?
4067   const APFloat &Val = CFP->getValueAPF();
4068   APFloat One(Val.getSemantics(), "1.0");
4069   return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
4070 }
4071 
4072 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
4073                                                 DAGCombinerInfo &DCI) const {
4074   SelectionDAG &DAG = DCI.DAG;
4075   SDLoc DL(N);
4076 
4077   switch(N->getOpcode()) {
4078   default:
4079     break;
4080   case ISD::BITCAST: {
4081     EVT DestVT = N->getValueType(0);
4082 
4083     // Push casts through vector builds. This helps avoid emitting a large
4084     // number of copies when materializing floating point vector constants.
4085     //
4086     // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
4087     //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
4088     if (DestVT.isVector()) {
4089       SDValue Src = N->getOperand(0);
4090       if (Src.getOpcode() == ISD::BUILD_VECTOR) {
4091         EVT SrcVT = Src.getValueType();
4092         unsigned NElts = DestVT.getVectorNumElements();
4093 
4094         if (SrcVT.getVectorNumElements() == NElts) {
4095           EVT DestEltVT = DestVT.getVectorElementType();
4096 
4097           SmallVector<SDValue, 8> CastedElts;
4098           SDLoc SL(N);
4099           for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
4100             SDValue Elt = Src.getOperand(I);
4101             CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
4102           }
4103 
4104           return DAG.getBuildVector(DestVT, SL, CastedElts);
4105         }
4106       }
4107     }
4108 
4109     if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
4110       break;
4111 
4112     // Fold bitcasts of constants.
4113     //
4114     // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
4115     // TODO: Generalize and move to DAGCombiner
4116     SDValue Src = N->getOperand(0);
4117     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
4118       SDLoc SL(N);
4119       uint64_t CVal = C->getZExtValue();
4120       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4121                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4122                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4123       return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
4124     }
4125 
4126     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
4127       const APInt &Val = C->getValueAPF().bitcastToAPInt();
4128       SDLoc SL(N);
4129       uint64_t CVal = Val.getZExtValue();
4130       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4131                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4132                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4133 
4134       return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
4135     }
4136 
4137     break;
4138   }
4139   case ISD::SHL: {
4140     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4141       break;
4142 
4143     return performShlCombine(N, DCI);
4144   }
4145   case ISD::SRL: {
4146     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4147       break;
4148 
4149     return performSrlCombine(N, DCI);
4150   }
4151   case ISD::SRA: {
4152     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4153       break;
4154 
4155     return performSraCombine(N, DCI);
4156   }
4157   case ISD::TRUNCATE:
4158     return performTruncateCombine(N, DCI);
4159   case ISD::MUL:
4160     return performMulCombine(N, DCI);
4161   case ISD::SMUL_LOHI:
4162   case ISD::UMUL_LOHI:
4163     return performMulLoHiCombine(N, DCI);
4164   case ISD::MULHS:
4165     return performMulhsCombine(N, DCI);
4166   case ISD::MULHU:
4167     return performMulhuCombine(N, DCI);
4168   case AMDGPUISD::MUL_I24:
4169   case AMDGPUISD::MUL_U24:
4170   case AMDGPUISD::MULHI_I24:
4171   case AMDGPUISD::MULHI_U24:
4172     return simplifyMul24(N, DCI);
4173   case ISD::SELECT:
4174     return performSelectCombine(N, DCI);
4175   case ISD::FNEG:
4176     return performFNegCombine(N, DCI);
4177   case ISD::FABS:
4178     return performFAbsCombine(N, DCI);
4179   case AMDGPUISD::BFE_I32:
4180   case AMDGPUISD::BFE_U32: {
4181     assert(!N->getValueType(0).isVector() &&
4182            "Vector handling of BFE not implemented");
4183     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
4184     if (!Width)
4185       break;
4186 
4187     uint32_t WidthVal = Width->getZExtValue() & 0x1f;
4188     if (WidthVal == 0)
4189       return DAG.getConstant(0, DL, MVT::i32);
4190 
4191     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
4192     if (!Offset)
4193       break;
4194 
4195     SDValue BitsFrom = N->getOperand(0);
4196     uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
4197 
4198     bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
4199 
4200     if (OffsetVal == 0) {
4201       // This is already sign / zero extended, so try to fold away extra BFEs.
4202       unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
4203 
4204       unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
4205       if (OpSignBits >= SignBits)
4206         return BitsFrom;
4207 
4208       EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
4209       if (Signed) {
4210         // This is a sign_extend_inreg. Replace it to take advantage of existing
4211         // DAG Combines. If not eliminated, we will match back to BFE during
4212         // selection.
4213 
4214         // TODO: The sext_inreg of extended types ends, although we can could
4215         // handle them in a single BFE.
4216         return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
4217                            DAG.getValueType(SmallVT));
4218       }
4219 
4220       return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
4221     }
4222 
4223     if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
4224       if (Signed) {
4225         return constantFoldBFE<int32_t>(DAG,
4226                                         CVal->getSExtValue(),
4227                                         OffsetVal,
4228                                         WidthVal,
4229                                         DL);
4230       }
4231 
4232       return constantFoldBFE<uint32_t>(DAG,
4233                                        CVal->getZExtValue(),
4234                                        OffsetVal,
4235                                        WidthVal,
4236                                        DL);
4237     }
4238 
4239     if ((OffsetVal + WidthVal) >= 32 &&
4240         !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
4241       SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
4242       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
4243                          BitsFrom, ShiftVal);
4244     }
4245 
4246     if (BitsFrom.hasOneUse()) {
4247       APInt Demanded = APInt::getBitsSet(32,
4248                                          OffsetVal,
4249                                          OffsetVal + WidthVal);
4250 
4251       KnownBits Known;
4252       TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
4253                                             !DCI.isBeforeLegalizeOps());
4254       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4255       if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
4256           TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
4257         DCI.CommitTargetLoweringOpt(TLO);
4258       }
4259     }
4260 
4261     break;
4262   }
4263   case ISD::LOAD:
4264     return performLoadCombine(N, DCI);
4265   case ISD::STORE:
4266     return performStoreCombine(N, DCI);
4267   case AMDGPUISD::RCP:
4268   case AMDGPUISD::RCP_IFLAG:
4269     return performRcpCombine(N, DCI);
4270   case ISD::AssertZext:
4271   case ISD::AssertSext:
4272     return performAssertSZExtCombine(N, DCI);
4273   case ISD::INTRINSIC_WO_CHAIN:
4274     return performIntrinsicWOChainCombine(N, DCI);
4275   }
4276   return SDValue();
4277 }
4278 
4279 //===----------------------------------------------------------------------===//
4280 // Helper functions
4281 //===----------------------------------------------------------------------===//
4282 
4283 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
4284                                                    const TargetRegisterClass *RC,
4285                                                    Register Reg, EVT VT,
4286                                                    const SDLoc &SL,
4287                                                    bool RawReg) const {
4288   MachineFunction &MF = DAG.getMachineFunction();
4289   MachineRegisterInfo &MRI = MF.getRegInfo();
4290   Register VReg;
4291 
4292   if (!MRI.isLiveIn(Reg)) {
4293     VReg = MRI.createVirtualRegister(RC);
4294     MRI.addLiveIn(Reg, VReg);
4295   } else {
4296     VReg = MRI.getLiveInVirtReg(Reg);
4297   }
4298 
4299   if (RawReg)
4300     return DAG.getRegister(VReg, VT);
4301 
4302   return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
4303 }
4304 
4305 // This may be called multiple times, and nothing prevents creating multiple
4306 // objects at the same offset. See if we already defined this object.
4307 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
4308                                        int64_t Offset) {
4309   for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
4310     if (MFI.getObjectOffset(I) == Offset) {
4311       assert(MFI.getObjectSize(I) == Size);
4312       return I;
4313     }
4314   }
4315 
4316   return MFI.CreateFixedObject(Size, Offset, true);
4317 }
4318 
4319 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
4320                                                   EVT VT,
4321                                                   const SDLoc &SL,
4322                                                   int64_t Offset) const {
4323   MachineFunction &MF = DAG.getMachineFunction();
4324   MachineFrameInfo &MFI = MF.getFrameInfo();
4325   int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
4326 
4327   auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
4328   SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
4329 
4330   return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
4331                      MachineMemOperand::MODereferenceable |
4332                          MachineMemOperand::MOInvariant);
4333 }
4334 
4335 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
4336                                                    const SDLoc &SL,
4337                                                    SDValue Chain,
4338                                                    SDValue ArgVal,
4339                                                    int64_t Offset) const {
4340   MachineFunction &MF = DAG.getMachineFunction();
4341   MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
4342   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4343 
4344   SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
4345   // Stores to the argument stack area are relative to the stack pointer.
4346   SDValue SP =
4347       DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
4348   Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
4349   SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
4350                                MachineMemOperand::MODereferenceable);
4351   return Store;
4352 }
4353 
4354 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
4355                                              const TargetRegisterClass *RC,
4356                                              EVT VT, const SDLoc &SL,
4357                                              const ArgDescriptor &Arg) const {
4358   assert(Arg && "Attempting to load missing argument");
4359 
4360   SDValue V = Arg.isRegister() ?
4361     CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
4362     loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
4363 
4364   if (!Arg.isMasked())
4365     return V;
4366 
4367   unsigned Mask = Arg.getMask();
4368   unsigned Shift = countTrailingZeros<unsigned>(Mask);
4369   V = DAG.getNode(ISD::SRL, SL, VT, V,
4370                   DAG.getShiftAmountConstant(Shift, VT, SL));
4371   return DAG.getNode(ISD::AND, SL, VT, V,
4372                      DAG.getConstant(Mask >> Shift, SL, VT));
4373 }
4374 
4375 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
4376     const MachineFunction &MF, const ImplicitParameter Param) const {
4377   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
4378   const AMDGPUSubtarget &ST =
4379       AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
4380   unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
4381   const Align Alignment = ST.getAlignmentForImplicitArgPtr();
4382   uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
4383                        ExplicitArgOffset;
4384   switch (Param) {
4385   case FIRST_IMPLICIT:
4386     return ArgOffset;
4387   case PRIVATE_BASE:
4388     return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
4389   case SHARED_BASE:
4390     return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
4391   case QUEUE_PTR:
4392     return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
4393   }
4394   llvm_unreachable("unexpected implicit parameter type");
4395 }
4396 
4397 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
4398 
4399 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
4400   switch ((AMDGPUISD::NodeType)Opcode) {
4401   case AMDGPUISD::FIRST_NUMBER: break;
4402   // AMDIL DAG nodes
4403   NODE_NAME_CASE(UMUL);
4404   NODE_NAME_CASE(BRANCH_COND);
4405 
4406   // AMDGPU DAG nodes
4407   NODE_NAME_CASE(IF)
4408   NODE_NAME_CASE(ELSE)
4409   NODE_NAME_CASE(LOOP)
4410   NODE_NAME_CASE(CALL)
4411   NODE_NAME_CASE(TC_RETURN)
4412   NODE_NAME_CASE(TRAP)
4413   NODE_NAME_CASE(RET_FLAG)
4414   NODE_NAME_CASE(RETURN_TO_EPILOG)
4415   NODE_NAME_CASE(ENDPGM)
4416   NODE_NAME_CASE(DWORDADDR)
4417   NODE_NAME_CASE(FRACT)
4418   NODE_NAME_CASE(SETCC)
4419   NODE_NAME_CASE(SETREG)
4420   NODE_NAME_CASE(DENORM_MODE)
4421   NODE_NAME_CASE(FMA_W_CHAIN)
4422   NODE_NAME_CASE(FMUL_W_CHAIN)
4423   NODE_NAME_CASE(CLAMP)
4424   NODE_NAME_CASE(COS_HW)
4425   NODE_NAME_CASE(SIN_HW)
4426   NODE_NAME_CASE(FMAX_LEGACY)
4427   NODE_NAME_CASE(FMIN_LEGACY)
4428   NODE_NAME_CASE(FMAX3)
4429   NODE_NAME_CASE(SMAX3)
4430   NODE_NAME_CASE(UMAX3)
4431   NODE_NAME_CASE(FMIN3)
4432   NODE_NAME_CASE(SMIN3)
4433   NODE_NAME_CASE(UMIN3)
4434   NODE_NAME_CASE(FMED3)
4435   NODE_NAME_CASE(SMED3)
4436   NODE_NAME_CASE(UMED3)
4437   NODE_NAME_CASE(FDOT2)
4438   NODE_NAME_CASE(URECIP)
4439   NODE_NAME_CASE(DIV_SCALE)
4440   NODE_NAME_CASE(DIV_FMAS)
4441   NODE_NAME_CASE(DIV_FIXUP)
4442   NODE_NAME_CASE(FMAD_FTZ)
4443   NODE_NAME_CASE(RCP)
4444   NODE_NAME_CASE(RSQ)
4445   NODE_NAME_CASE(RCP_LEGACY)
4446   NODE_NAME_CASE(RCP_IFLAG)
4447   NODE_NAME_CASE(FMUL_LEGACY)
4448   NODE_NAME_CASE(RSQ_CLAMP)
4449   NODE_NAME_CASE(LDEXP)
4450   NODE_NAME_CASE(FP_CLASS)
4451   NODE_NAME_CASE(DOT4)
4452   NODE_NAME_CASE(CARRY)
4453   NODE_NAME_CASE(BORROW)
4454   NODE_NAME_CASE(BFE_U32)
4455   NODE_NAME_CASE(BFE_I32)
4456   NODE_NAME_CASE(BFI)
4457   NODE_NAME_CASE(BFM)
4458   NODE_NAME_CASE(FFBH_U32)
4459   NODE_NAME_CASE(FFBH_I32)
4460   NODE_NAME_CASE(FFBL_B32)
4461   NODE_NAME_CASE(MUL_U24)
4462   NODE_NAME_CASE(MUL_I24)
4463   NODE_NAME_CASE(MULHI_U24)
4464   NODE_NAME_CASE(MULHI_I24)
4465   NODE_NAME_CASE(MAD_U24)
4466   NODE_NAME_CASE(MAD_I24)
4467   NODE_NAME_CASE(MAD_I64_I32)
4468   NODE_NAME_CASE(MAD_U64_U32)
4469   NODE_NAME_CASE(PERM)
4470   NODE_NAME_CASE(TEXTURE_FETCH)
4471   NODE_NAME_CASE(R600_EXPORT)
4472   NODE_NAME_CASE(CONST_ADDRESS)
4473   NODE_NAME_CASE(REGISTER_LOAD)
4474   NODE_NAME_CASE(REGISTER_STORE)
4475   NODE_NAME_CASE(SAMPLE)
4476   NODE_NAME_CASE(SAMPLEB)
4477   NODE_NAME_CASE(SAMPLED)
4478   NODE_NAME_CASE(SAMPLEL)
4479   NODE_NAME_CASE(CVT_F32_UBYTE0)
4480   NODE_NAME_CASE(CVT_F32_UBYTE1)
4481   NODE_NAME_CASE(CVT_F32_UBYTE2)
4482   NODE_NAME_CASE(CVT_F32_UBYTE3)
4483   NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
4484   NODE_NAME_CASE(CVT_PKNORM_I16_F32)
4485   NODE_NAME_CASE(CVT_PKNORM_U16_F32)
4486   NODE_NAME_CASE(CVT_PK_I16_I32)
4487   NODE_NAME_CASE(CVT_PK_U16_U32)
4488   NODE_NAME_CASE(FP_TO_FP16)
4489   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
4490   NODE_NAME_CASE(CONST_DATA_PTR)
4491   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
4492   NODE_NAME_CASE(LDS)
4493   NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
4494   NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
4495   NODE_NAME_CASE(DUMMY_CHAIN)
4496   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
4497   NODE_NAME_CASE(LOAD_D16_HI)
4498   NODE_NAME_CASE(LOAD_D16_LO)
4499   NODE_NAME_CASE(LOAD_D16_HI_I8)
4500   NODE_NAME_CASE(LOAD_D16_HI_U8)
4501   NODE_NAME_CASE(LOAD_D16_LO_I8)
4502   NODE_NAME_CASE(LOAD_D16_LO_U8)
4503   NODE_NAME_CASE(STORE_MSKOR)
4504   NODE_NAME_CASE(LOAD_CONSTANT)
4505   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
4506   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
4507   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
4508   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
4509   NODE_NAME_CASE(DS_ORDERED_COUNT)
4510   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
4511   NODE_NAME_CASE(ATOMIC_INC)
4512   NODE_NAME_CASE(ATOMIC_DEC)
4513   NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
4514   NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
4515   NODE_NAME_CASE(BUFFER_LOAD)
4516   NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
4517   NODE_NAME_CASE(BUFFER_LOAD_USHORT)
4518   NODE_NAME_CASE(BUFFER_LOAD_BYTE)
4519   NODE_NAME_CASE(BUFFER_LOAD_SHORT)
4520   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
4521   NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
4522   NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
4523   NODE_NAME_CASE(SBUFFER_LOAD)
4524   NODE_NAME_CASE(BUFFER_STORE)
4525   NODE_NAME_CASE(BUFFER_STORE_BYTE)
4526   NODE_NAME_CASE(BUFFER_STORE_SHORT)
4527   NODE_NAME_CASE(BUFFER_STORE_FORMAT)
4528   NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
4529   NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
4530   NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
4531   NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
4532   NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
4533   NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
4534   NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
4535   NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
4536   NODE_NAME_CASE(BUFFER_ATOMIC_AND)
4537   NODE_NAME_CASE(BUFFER_ATOMIC_OR)
4538   NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
4539   NODE_NAME_CASE(BUFFER_ATOMIC_INC)
4540   NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
4541   NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
4542   NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
4543   NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
4544   NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
4545   NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
4546 
4547   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
4548   }
4549   return nullptr;
4550 }
4551 
4552 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
4553                                               SelectionDAG &DAG, int Enabled,
4554                                               int &RefinementSteps,
4555                                               bool &UseOneConstNR,
4556                                               bool Reciprocal) const {
4557   EVT VT = Operand.getValueType();
4558 
4559   if (VT == MVT::f32) {
4560     RefinementSteps = 0;
4561     return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
4562   }
4563 
4564   // TODO: There is also f64 rsq instruction, but the documentation is less
4565   // clear on its precision.
4566 
4567   return SDValue();
4568 }
4569 
4570 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
4571                                                SelectionDAG &DAG, int Enabled,
4572                                                int &RefinementSteps) const {
4573   EVT VT = Operand.getValueType();
4574 
4575   if (VT == MVT::f32) {
4576     // Reciprocal, < 1 ulp error.
4577     //
4578     // This reciprocal approximation converges to < 0.5 ulp error with one
4579     // newton rhapson performed with two fused multiple adds (FMAs).
4580 
4581     RefinementSteps = 0;
4582     return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
4583   }
4584 
4585   // TODO: There is also f64 rcp instruction, but the documentation is less
4586   // clear on its precision.
4587 
4588   return SDValue();
4589 }
4590 
4591 static unsigned workitemIntrinsicDim(unsigned ID) {
4592   switch (ID) {
4593   case Intrinsic::amdgcn_workitem_id_x:
4594     return 0;
4595   case Intrinsic::amdgcn_workitem_id_y:
4596     return 1;
4597   case Intrinsic::amdgcn_workitem_id_z:
4598     return 2;
4599   default:
4600     llvm_unreachable("not a workitem intrinsic");
4601   }
4602 }
4603 
4604 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
4605     const SDValue Op, KnownBits &Known,
4606     const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
4607 
4608   Known.resetAll(); // Don't know anything.
4609 
4610   unsigned Opc = Op.getOpcode();
4611 
4612   switch (Opc) {
4613   default:
4614     break;
4615   case AMDGPUISD::CARRY:
4616   case AMDGPUISD::BORROW: {
4617     Known.Zero = APInt::getHighBitsSet(32, 31);
4618     break;
4619   }
4620 
4621   case AMDGPUISD::BFE_I32:
4622   case AMDGPUISD::BFE_U32: {
4623     ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4624     if (!CWidth)
4625       return;
4626 
4627     uint32_t Width = CWidth->getZExtValue() & 0x1f;
4628 
4629     if (Opc == AMDGPUISD::BFE_U32)
4630       Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
4631 
4632     break;
4633   }
4634   case AMDGPUISD::FP_TO_FP16: {
4635     unsigned BitWidth = Known.getBitWidth();
4636 
4637     // High bits are zero.
4638     Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
4639     break;
4640   }
4641   case AMDGPUISD::MUL_U24:
4642   case AMDGPUISD::MUL_I24: {
4643     KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4644     KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4645     unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
4646                       RHSKnown.countMinTrailingZeros();
4647     Known.Zero.setLowBits(std::min(TrailZ, 32u));
4648     // Skip extra check if all bits are known zeros.
4649     if (TrailZ >= 32)
4650       break;
4651 
4652     // Truncate to 24 bits.
4653     LHSKnown = LHSKnown.trunc(24);
4654     RHSKnown = RHSKnown.trunc(24);
4655 
4656     if (Opc == AMDGPUISD::MUL_I24) {
4657       unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
4658       unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
4659       unsigned MaxValBits = LHSValBits + RHSValBits;
4660       if (MaxValBits > 32)
4661         break;
4662       unsigned SignBits = 32 - MaxValBits + 1;
4663       bool LHSNegative = LHSKnown.isNegative();
4664       bool LHSNonNegative = LHSKnown.isNonNegative();
4665       bool LHSPositive = LHSKnown.isStrictlyPositive();
4666       bool RHSNegative = RHSKnown.isNegative();
4667       bool RHSNonNegative = RHSKnown.isNonNegative();
4668       bool RHSPositive = RHSKnown.isStrictlyPositive();
4669 
4670       if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
4671         Known.Zero.setHighBits(SignBits);
4672       else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
4673         Known.One.setHighBits(SignBits);
4674     } else {
4675       unsigned LHSValBits = LHSKnown.countMaxActiveBits();
4676       unsigned RHSValBits = RHSKnown.countMaxActiveBits();
4677       unsigned MaxValBits = LHSValBits + RHSValBits;
4678       if (MaxValBits >= 32)
4679         break;
4680       Known.Zero.setBitsFrom(MaxValBits);
4681     }
4682     break;
4683   }
4684   case AMDGPUISD::PERM: {
4685     ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4686     if (!CMask)
4687       return;
4688 
4689     KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4690     KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4691     unsigned Sel = CMask->getZExtValue();
4692 
4693     for (unsigned I = 0; I < 32; I += 8) {
4694       unsigned SelBits = Sel & 0xff;
4695       if (SelBits < 4) {
4696         SelBits *= 8;
4697         Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4698         Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4699       } else if (SelBits < 7) {
4700         SelBits = (SelBits & 3) * 8;
4701         Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4702         Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4703       } else if (SelBits == 0x0c) {
4704         Known.Zero |= 0xFFull << I;
4705       } else if (SelBits > 0x0c) {
4706         Known.One |= 0xFFull << I;
4707       }
4708       Sel >>= 8;
4709     }
4710     break;
4711   }
4712   case AMDGPUISD::BUFFER_LOAD_UBYTE:  {
4713     Known.Zero.setHighBits(24);
4714     break;
4715   }
4716   case AMDGPUISD::BUFFER_LOAD_USHORT: {
4717     Known.Zero.setHighBits(16);
4718     break;
4719   }
4720   case AMDGPUISD::LDS: {
4721     auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
4722     Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
4723 
4724     Known.Zero.setHighBits(16);
4725     Known.Zero.setLowBits(Log2(Alignment));
4726     break;
4727   }
4728   case ISD::INTRINSIC_WO_CHAIN: {
4729     unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4730     switch (IID) {
4731     case Intrinsic::amdgcn_mbcnt_lo:
4732     case Intrinsic::amdgcn_mbcnt_hi: {
4733       const GCNSubtarget &ST =
4734           DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
4735       // These return at most the (wavefront size - 1) + src1
4736       // As long as src1 is an immediate we can calc known bits
4737       KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
4738       unsigned Src1ValBits = Src1Known.countMaxActiveBits();
4739       unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
4740       // Cater for potential carry
4741       MaxActiveBits += Src1ValBits ? 1 : 0;
4742       unsigned Size = Op.getValueType().getSizeInBits();
4743       if (MaxActiveBits < Size)
4744         Known.Zero.setHighBits(Size - MaxActiveBits);
4745       break;
4746     }
4747     case Intrinsic::amdgcn_workitem_id_x:
4748     case Intrinsic::amdgcn_workitem_id_y:
4749     case Intrinsic::amdgcn_workitem_id_z: {
4750       unsigned MaxValue = Subtarget->getMaxWorkitemID(
4751           DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
4752       Known.Zero.setHighBits(countLeadingZeros(MaxValue));
4753       break;
4754     }
4755     default:
4756       break;
4757     }
4758   }
4759   }
4760 }
4761 
4762 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
4763     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
4764     unsigned Depth) const {
4765   switch (Op.getOpcode()) {
4766   case AMDGPUISD::BFE_I32: {
4767     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4768     if (!Width)
4769       return 1;
4770 
4771     unsigned SignBits = 32 - Width->getZExtValue() + 1;
4772     if (!isNullConstant(Op.getOperand(1)))
4773       return SignBits;
4774 
4775     // TODO: Could probably figure something out with non-0 offsets.
4776     unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
4777     return std::max(SignBits, Op0SignBits);
4778   }
4779 
4780   case AMDGPUISD::BFE_U32: {
4781     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4782     return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
4783   }
4784 
4785   case AMDGPUISD::CARRY:
4786   case AMDGPUISD::BORROW:
4787     return 31;
4788   case AMDGPUISD::BUFFER_LOAD_BYTE:
4789     return 25;
4790   case AMDGPUISD::BUFFER_LOAD_SHORT:
4791     return 17;
4792   case AMDGPUISD::BUFFER_LOAD_UBYTE:
4793     return 24;
4794   case AMDGPUISD::BUFFER_LOAD_USHORT:
4795     return 16;
4796   case AMDGPUISD::FP_TO_FP16:
4797     return 16;
4798   default:
4799     return 1;
4800   }
4801 }
4802 
4803 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
4804   GISelKnownBits &Analysis, Register R,
4805   const APInt &DemandedElts, const MachineRegisterInfo &MRI,
4806   unsigned Depth) const {
4807   const MachineInstr *MI = MRI.getVRegDef(R);
4808   if (!MI)
4809     return 1;
4810 
4811   // TODO: Check range metadata on MMO.
4812   switch (MI->getOpcode()) {
4813   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4814     return 25;
4815   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4816     return 17;
4817   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4818     return 24;
4819   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4820     return 16;
4821   default:
4822     return 1;
4823   }
4824 }
4825 
4826 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
4827                                                         const SelectionDAG &DAG,
4828                                                         bool SNaN,
4829                                                         unsigned Depth) const {
4830   unsigned Opcode = Op.getOpcode();
4831   switch (Opcode) {
4832   case AMDGPUISD::FMIN_LEGACY:
4833   case AMDGPUISD::FMAX_LEGACY: {
4834     if (SNaN)
4835       return true;
4836 
4837     // TODO: Can check no nans on one of the operands for each one, but which
4838     // one?
4839     return false;
4840   }
4841   case AMDGPUISD::FMUL_LEGACY:
4842   case AMDGPUISD::CVT_PKRTZ_F16_F32: {
4843     if (SNaN)
4844       return true;
4845     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4846            DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4847   }
4848   case AMDGPUISD::FMED3:
4849   case AMDGPUISD::FMIN3:
4850   case AMDGPUISD::FMAX3:
4851   case AMDGPUISD::FMAD_FTZ: {
4852     if (SNaN)
4853       return true;
4854     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4855            DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4856            DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4857   }
4858   case AMDGPUISD::CVT_F32_UBYTE0:
4859   case AMDGPUISD::CVT_F32_UBYTE1:
4860   case AMDGPUISD::CVT_F32_UBYTE2:
4861   case AMDGPUISD::CVT_F32_UBYTE3:
4862     return true;
4863 
4864   case AMDGPUISD::RCP:
4865   case AMDGPUISD::RSQ:
4866   case AMDGPUISD::RCP_LEGACY:
4867   case AMDGPUISD::RSQ_CLAMP: {
4868     if (SNaN)
4869       return true;
4870 
4871     // TODO: Need is known positive check.
4872     return false;
4873   }
4874   case AMDGPUISD::LDEXP:
4875   case AMDGPUISD::FRACT: {
4876     if (SNaN)
4877       return true;
4878     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
4879   }
4880   case AMDGPUISD::DIV_SCALE:
4881   case AMDGPUISD::DIV_FMAS:
4882   case AMDGPUISD::DIV_FIXUP:
4883     // TODO: Refine on operands.
4884     return SNaN;
4885   case AMDGPUISD::SIN_HW:
4886   case AMDGPUISD::COS_HW: {
4887     // TODO: Need check for infinity
4888     return SNaN;
4889   }
4890   case ISD::INTRINSIC_WO_CHAIN: {
4891     unsigned IntrinsicID
4892       = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4893     // TODO: Handle more intrinsics
4894     switch (IntrinsicID) {
4895     case Intrinsic::amdgcn_cubeid:
4896       return true;
4897 
4898     case Intrinsic::amdgcn_frexp_mant: {
4899       if (SNaN)
4900         return true;
4901       return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4902     }
4903     case Intrinsic::amdgcn_cvt_pkrtz: {
4904       if (SNaN)
4905         return true;
4906       return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4907              DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4908     }
4909     case Intrinsic::amdgcn_rcp:
4910     case Intrinsic::amdgcn_rsq:
4911     case Intrinsic::amdgcn_rcp_legacy:
4912     case Intrinsic::amdgcn_rsq_legacy:
4913     case Intrinsic::amdgcn_rsq_clamp: {
4914       if (SNaN)
4915         return true;
4916 
4917       // TODO: Need is known positive check.
4918       return false;
4919     }
4920     case Intrinsic::amdgcn_trig_preop:
4921     case Intrinsic::amdgcn_fdot2:
4922       // TODO: Refine on operand
4923       return SNaN;
4924     case Intrinsic::amdgcn_fma_legacy:
4925       if (SNaN)
4926         return true;
4927       return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4928              DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
4929              DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
4930     default:
4931       return false;
4932     }
4933   }
4934   default:
4935     return false;
4936   }
4937 }
4938 
4939 TargetLowering::AtomicExpansionKind
4940 AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
4941   switch (RMW->getOperation()) {
4942   case AtomicRMWInst::Nand:
4943   case AtomicRMWInst::FAdd:
4944   case AtomicRMWInst::FSub:
4945   case AtomicRMWInst::FMax:
4946   case AtomicRMWInst::FMin:
4947     return AtomicExpansionKind::CmpXChg;
4948   default: {
4949     if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
4950       unsigned Size = IntTy->getBitWidth();
4951       if (Size == 32 || Size == 64)
4952         return AtomicExpansionKind::None;
4953     }
4954 
4955     return AtomicExpansionKind::CmpXChg;
4956   }
4957   }
4958 }
4959 
4960 bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtractLegal(
4961     unsigned Opc, LLT Ty1, LLT Ty2) const {
4962   return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) &&
4963          Ty2 == LLT::scalar(32);
4964 }
4965