xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp (revision 7fdf597e96a02165cfe22ff357b857d5fa15ed8a)
1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Custom DAG lowering for SI
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIISelLowering.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUInstrInfo.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "SIRegisterInfo.h"
22 #include "llvm/ADT/APInt.h"
23 #include "llvm/ADT/FloatingPointMode.h"
24 #include "llvm/ADT/Statistic.h"
25 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
26 #include "llvm/Analysis/UniformityAnalysis.h"
27 #include "llvm/BinaryFormat/ELF.h"
28 #include "llvm/CodeGen/Analysis.h"
29 #include "llvm/CodeGen/ByteProvider.h"
30 #include "llvm/CodeGen/FunctionLoweringInfo.h"
31 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
32 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
33 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
34 #include "llvm/CodeGen/MachineFrameInfo.h"
35 #include "llvm/CodeGen/MachineFunction.h"
36 #include "llvm/CodeGen/MachineLoopInfo.h"
37 #include "llvm/IR/DiagnosticInfo.h"
38 #include "llvm/IR/IRBuilder.h"
39 #include "llvm/IR/IntrinsicInst.h"
40 #include "llvm/IR/IntrinsicsAMDGPU.h"
41 #include "llvm/IR/IntrinsicsR600.h"
42 #include "llvm/Support/CommandLine.h"
43 #include "llvm/Support/KnownBits.h"
44 #include "llvm/Support/ModRef.h"
45 #include <optional>
46 
47 using namespace llvm;
48 
49 #define DEBUG_TYPE "si-lower"
50 
51 STATISTIC(NumTailCalls, "Number of tail calls");
52 
53 static cl::opt<bool> DisableLoopAlignment(
54   "amdgpu-disable-loop-alignment",
55   cl::desc("Do not align and prefetch loops"),
56   cl::init(false));
57 
58 static cl::opt<bool> UseDivergentRegisterIndexing(
59   "amdgpu-use-divergent-register-indexing",
60   cl::Hidden,
61   cl::desc("Use indirect register addressing for divergent indexes"),
62   cl::init(false));
63 
64 static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
65   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
66   return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67 }
68 
69 static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
70   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
71   return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72 }
73 
74 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75   unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76   for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77     if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78       return AMDGPU::SGPR0 + Reg;
79     }
80   }
81   llvm_unreachable("Cannot allocate sgpr");
82 }
83 
84 SITargetLowering::SITargetLowering(const TargetMachine &TM,
85                                    const GCNSubtarget &STI)
86     : AMDGPUTargetLowering(TM, STI),
87       Subtarget(&STI) {
88   addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
89   addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
90 
91   addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
92   addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
93 
94   addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
95 
96   const SIRegisterInfo *TRI = STI.getRegisterInfo();
97   const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
98 
99   addRegisterClass(MVT::f64, V64RegClass);
100   addRegisterClass(MVT::v2f32, V64RegClass);
101   addRegisterClass(MVT::Untyped, V64RegClass);
102 
103   addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
104   addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
105 
106   addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
107   addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
108 
109   addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
110   addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
111 
112   addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
113   addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
114 
115   addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
116   addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
117 
118   addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
119   addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
120 
121   addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
122   addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
123 
124   addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
125   addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
126 
127   addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
128   addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
129 
130   addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
131   addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
132 
133   addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
134   addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
135 
136   addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
137   addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
138 
139   addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
140   addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
141 
142   addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
143   addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
144 
145   addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
146   addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
147 
148   addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
149   addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
150 
151   if (Subtarget->has16BitInsts()) {
152     if (Subtarget->useRealTrue16Insts()) {
153       addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
154       addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
155       addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
156     } else {
157       addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
158       addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
159       addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
160     }
161 
162     // Unless there are also VOP3P operations, not operations are really legal.
163     addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
164     addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
165     addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
166     addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
167     addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
168     addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
169     addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
170     addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
171     addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
172     addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
173     addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
174     addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
175     addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
176     addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
177     addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
178   }
179 
180   addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
181   addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
182 
183   computeRegisterProperties(Subtarget->getRegisterInfo());
184 
185   // The boolean content concept here is too inflexible. Compares only ever
186   // really produce a 1-bit result. Any copy/extend from these will turn into a
187   // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
188   // it's what most targets use.
189   setBooleanContents(ZeroOrOneBooleanContent);
190   setBooleanVectorContents(ZeroOrOneBooleanContent);
191 
192   // We need to custom lower vector stores from local memory
193   setOperationAction(ISD::LOAD,
194                      {MVT::v2i32,  MVT::v3i32,  MVT::v4i32,  MVT::v5i32,
195                       MVT::v6i32,  MVT::v7i32,  MVT::v8i32,  MVT::v9i32,
196                       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
197                       MVT::i1,     MVT::v32i32},
198                      Custom);
199 
200   setOperationAction(ISD::STORE,
201                      {MVT::v2i32,  MVT::v3i32,  MVT::v4i32,  MVT::v5i32,
202                       MVT::v6i32,  MVT::v7i32,  MVT::v8i32,  MVT::v9i32,
203                       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
204                       MVT::i1,     MVT::v32i32},
205                      Custom);
206 
207   if (isTypeLegal(MVT::bf16)) {
208     for (unsigned Opc :
209          {ISD::FADD,     ISD::FSUB,       ISD::FMUL,    ISD::FDIV,
210           ISD::FREM,     ISD::FMA,        ISD::FMINNUM, ISD::FMAXNUM,
211           ISD::FMINIMUM, ISD::FMAXIMUM,   ISD::FSQRT,   ISD::FCBRT,
212           ISD::FSIN,     ISD::FCOS,       ISD::FPOW,    ISD::FPOWI,
213           ISD::FLDEXP,   ISD::FFREXP,     ISD::FLOG,    ISD::FLOG2,
214           ISD::FLOG10,   ISD::FEXP,       ISD::FEXP2,   ISD::FEXP10,
215           ISD::FCEIL,    ISD::FTRUNC,     ISD::FRINT,   ISD::FNEARBYINT,
216           ISD::FROUND,   ISD::FROUNDEVEN, ISD::FFLOOR,  ISD::FCANONICALIZE,
217           ISD::SETCC}) {
218       // FIXME: The promoted to type shouldn't need to be explicit
219       setOperationAction(Opc, MVT::bf16, Promote);
220       AddPromotedToType(Opc, MVT::bf16, MVT::f32);
221     }
222 
223     setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand);
224 
225     setOperationAction(ISD::SELECT, MVT::bf16, Promote);
226     AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
227 
228     setOperationAction(ISD::FABS, MVT::bf16, Legal);
229     setOperationAction(ISD::FNEG, MVT::bf16, Legal);
230     setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Legal);
231 
232     // We only need to custom lower because we can't specify an action for bf16
233     // sources.
234     setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
235     setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
236   }
237 
238   setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
239   setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
240   setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
241   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
242   setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
243   setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
244   setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
245   setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
246   setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
247   setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
248   setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
249   setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
250   setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
251   setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
252   setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
253   setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
254 
255   setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
256   setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
257   setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
258   setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
259   setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
260   setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
261   setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
262 
263   setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
264 
265   setOperationAction(ISD::SELECT, MVT::i1, Promote);
266   setOperationAction(ISD::SELECT, MVT::i64, Custom);
267   setOperationAction(ISD::SELECT, MVT::f64, Promote);
268   AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
269 
270   setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
271 
272   setOperationAction(ISD::SELECT_CC,
273                      {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
274 
275   setOperationAction(ISD::SETCC, MVT::i1, Promote);
276   setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
277   AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
278 
279   setOperationAction(ISD::TRUNCATE,
280                      {MVT::v2i32,  MVT::v3i32,  MVT::v4i32,  MVT::v5i32,
281                       MVT::v6i32,  MVT::v7i32,  MVT::v8i32,  MVT::v9i32,
282                       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
283                      Expand);
284   setOperationAction(ISD::FP_ROUND,
285                      {MVT::v2f32,  MVT::v3f32,  MVT::v4f32,  MVT::v5f32,
286                       MVT::v6f32,  MVT::v7f32,  MVT::v8f32,  MVT::v9f32,
287                       MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
288                      Expand);
289 
290   setOperationAction(ISD::SIGN_EXTEND_INREG,
291                      {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
292                       MVT::v3i16, MVT::v4i16, MVT::Other},
293                      Custom);
294 
295   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
296   setOperationAction(ISD::BR_CC,
297                      {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
298 
299   setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);
300 
301   setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal);
302 
303   setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64,
304                      Expand);
305 
306 #if 0
307   setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal);
308 #endif
309 
310   // We only support LOAD/STORE and vector manipulation ops for vectors
311   // with > 4 elements.
312   for (MVT VT :
313        {MVT::v8i32,   MVT::v8f32,  MVT::v9i32,  MVT::v9f32,  MVT::v10i32,
314         MVT::v10f32,  MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
315         MVT::v16i32,  MVT::v16f32, MVT::v2i64,  MVT::v2f64,  MVT::v4i16,
316         MVT::v4f16,   MVT::v4bf16, MVT::v3i64,  MVT::v3f64,  MVT::v6i32,
317         MVT::v6f32,   MVT::v4i64,  MVT::v4f64,  MVT::v8i64,  MVT::v8f64,
318         MVT::v8i16,   MVT::v8f16,  MVT::v8bf16, MVT::v16i16, MVT::v16f16,
319         MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
320         MVT::v32i16,  MVT::v32f16, MVT::v32bf16}) {
321     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
322       switch (Op) {
323       case ISD::LOAD:
324       case ISD::STORE:
325       case ISD::BUILD_VECTOR:
326       case ISD::BITCAST:
327       case ISD::UNDEF:
328       case ISD::EXTRACT_VECTOR_ELT:
329       case ISD::INSERT_VECTOR_ELT:
330       case ISD::SCALAR_TO_VECTOR:
331       case ISD::IS_FPCLASS:
332         break;
333       case ISD::EXTRACT_SUBVECTOR:
334       case ISD::INSERT_SUBVECTOR:
335       case ISD::CONCAT_VECTORS:
336         setOperationAction(Op, VT, Custom);
337         break;
338       default:
339         setOperationAction(Op, VT, Expand);
340         break;
341       }
342     }
343   }
344 
345   setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
346 
347   // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
348   // is expanded to avoid having two separate loops in case the index is a VGPR.
349 
350   // Most operations are naturally 32-bit vector operations. We only support
351   // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
352   for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
353     setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
354     AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
355 
356     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
357     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
358 
359     setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
360     AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
361 
362     setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
363     AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
364   }
365 
366   for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
367     setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
368     AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
369 
370     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
371     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
372 
373     setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
374     AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
375 
376     setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
377     AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
378   }
379 
380   for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
381     setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
382     AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
383 
384     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
385     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
386 
387     setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
388     AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
389 
390     setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
391     AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
392   }
393 
394   for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
395     setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
396     AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
397 
398     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
399     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
400 
401     setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
402     AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
403 
404     setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
405     AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
406   }
407 
408   for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
409     setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
410     AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
411 
412     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
413     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
414 
415     setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
416     AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
417 
418     setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
419     AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
420   }
421 
422   setOperationAction(ISD::VECTOR_SHUFFLE,
423                      {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
424                      Expand);
425 
426   setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
427                      Custom);
428 
429   // Avoid stack access for these.
430   // TODO: Generalize to more vector types.
431   setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
432                      {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
433                       MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
434                      Custom);
435 
436   // Deal with vec3 vector operations when widened to vec4.
437   setOperationAction(ISD::INSERT_SUBVECTOR,
438                      {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
439 
440   // Deal with vec5/6/7 vector operations when widened to vec8.
441   setOperationAction(ISD::INSERT_SUBVECTOR,
442                      {MVT::v5i32,  MVT::v5f32,  MVT::v6i32,  MVT::v6f32,
443                       MVT::v7i32,  MVT::v7f32,  MVT::v8i32,  MVT::v8f32,
444                       MVT::v9i32,  MVT::v9f32,  MVT::v10i32, MVT::v10f32,
445                       MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
446                      Custom);
447 
448   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
449   // and output demarshalling
450   setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
451 
452   // We can't return success/failure, only the old value,
453   // let LLVM add the comparison
454   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
455                      Expand);
456 
457   setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
458 
459   setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
460 
461   // FIXME: This should be narrowed to i32, but that only happens if i64 is
462   // illegal.
463   // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
464   setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
465 
466   // On SI this is s_memtime and s_memrealtime on VI.
467   setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
468 
469   if (Subtarget->hasSMemRealTime() ||
470       Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
471     setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
472   setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
473 
474   if (Subtarget->has16BitInsts()) {
475     setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
476     setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
477   } else {
478     setOperationAction(ISD::FSQRT, MVT::f16, Custom);
479   }
480 
481   if (Subtarget->hasMadMacF32Insts())
482     setOperationAction(ISD::FMAD, MVT::f32, Legal);
483 
484   if (!Subtarget->hasBFI())
485     // fcopysign can be done in a single instruction with BFI.
486     setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
487 
488   if (!Subtarget->hasBCNT(32))
489     setOperationAction(ISD::CTPOP, MVT::i32, Expand);
490 
491   if (!Subtarget->hasBCNT(64))
492     setOperationAction(ISD::CTPOP, MVT::i64, Expand);
493 
494   if (Subtarget->hasFFBH())
495     setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);
496 
497   if (Subtarget->hasFFBL())
498     setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
499 
500   // We only really have 32-bit BFE instructions (and 16-bit on VI).
501   //
502   // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
503   // effort to match them now. We want this to be false for i64 cases when the
504   // extraction isn't restricted to the upper or lower half. Ideally we would
505   // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
506   // span the midpoint are probably relatively rare, so don't worry about them
507   // for now.
508   if (Subtarget->hasBFE())
509     setHasExtractBitsInsn(true);
510 
511   // Clamp modifier on add/sub
512   if (Subtarget->hasIntClamp())
513     setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal);
514 
515   if (Subtarget->hasAddNoCarry())
516     setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
517                        Legal);
518 
519   setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
520                      Custom);
521 
522   // These are really only legal for ieee_mode functions. We should be avoiding
523   // them for functions that don't have ieee_mode enabled, so just say they are
524   // legal.
525   setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
526                      {MVT::f32, MVT::f64}, Legal);
527 
528   if (Subtarget->haveRoundOpsF64())
529     setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
530                        Legal);
531   else
532     setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
533                        MVT::f64, Custom);
534 
535   setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
536   setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
537                      Legal);
538   setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
539 
540   setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
541   setOperationAction(ISD::FDIV, MVT::f64, Custom);
542 
543   setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
544   setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
545 
546   // Custom lower these because we can't specify a rule based on an illegal
547   // source bf16.
548   setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
549   setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
550 
551   if (Subtarget->has16BitInsts()) {
552     setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
553                         ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
554                        MVT::i16, Legal);
555 
556     AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
557 
558     setOperationAction({ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
559                        MVT::i16, Expand);
560 
561     setOperationAction({ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
562                         ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
563                         ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
564                         ISD::CTPOP},
565                        MVT::i16, Promote);
566 
567     setOperationAction(ISD::LOAD, MVT::i16, Custom);
568 
569     setTruncStoreAction(MVT::i64, MVT::i16, Expand);
570 
571     setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
572     AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
573     setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
574     AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
575 
576     setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom);
577     setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
578     setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
579 
580     setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i32, Custom);
581 
582     // F16 - Constant Actions.
583     setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
584     setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
585 
586     // F16 - Load/Store Actions.
587     setOperationAction(ISD::LOAD, MVT::f16, Promote);
588     AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
589     setOperationAction(ISD::STORE, MVT::f16, Promote);
590     AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
591 
592     // BF16 - Load/Store Actions.
593     setOperationAction(ISD::LOAD, MVT::bf16, Promote);
594     AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
595     setOperationAction(ISD::STORE, MVT::bf16, Promote);
596     AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
597 
598     // F16 - VOP1 Actions.
599     setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
600                         ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},
601                        MVT::f16, Custom);
602 
603     setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
604     setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);
605 
606     // F16 - VOP2 Actions.
607     setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
608                        Expand);
609     setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
610     setOperationAction(ISD::FFREXP, MVT::f16, Custom);
611     setOperationAction(ISD::FDIV, MVT::f16, Custom);
612 
613     // F16 - VOP3 Actions.
614     setOperationAction(ISD::FMA, MVT::f16, Legal);
615     if (STI.hasMadF16())
616       setOperationAction(ISD::FMAD, MVT::f16, Legal);
617 
618     for (MVT VT :
619          {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
620           MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
621           MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
622       for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
623         switch (Op) {
624         case ISD::LOAD:
625         case ISD::STORE:
626         case ISD::BUILD_VECTOR:
627         case ISD::BITCAST:
628         case ISD::UNDEF:
629         case ISD::EXTRACT_VECTOR_ELT:
630         case ISD::INSERT_VECTOR_ELT:
631         case ISD::INSERT_SUBVECTOR:
632         case ISD::EXTRACT_SUBVECTOR:
633         case ISD::SCALAR_TO_VECTOR:
634         case ISD::IS_FPCLASS:
635           break;
636         case ISD::CONCAT_VECTORS:
637           setOperationAction(Op, VT, Custom);
638           break;
639         default:
640           setOperationAction(Op, VT, Expand);
641           break;
642         }
643       }
644     }
645 
646     // v_perm_b32 can handle either of these.
647     setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
648     setOperationAction(ISD::BSWAP, MVT::v4i16, Custom);
649 
650     // XXX - Do these do anything? Vector constants turn into build_vector.
651     setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
652 
653     setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
654                        Legal);
655 
656     setOperationAction(ISD::STORE, MVT::v2i16, Promote);
657     AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
658     setOperationAction(ISD::STORE, MVT::v2f16, Promote);
659     AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
660 
661     setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
662     AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
663     setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
664     AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
665 
666     setOperationAction(ISD::AND, MVT::v2i16, Promote);
667     AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
668     setOperationAction(ISD::OR, MVT::v2i16, Promote);
669     AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
670     setOperationAction(ISD::XOR, MVT::v2i16, Promote);
671     AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
672 
673     setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
674     AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
675     setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
676     AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
677     setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
678     AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
679 
680     setOperationAction(ISD::STORE, MVT::v4i16, Promote);
681     AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
682     setOperationAction(ISD::STORE, MVT::v4f16, Promote);
683     AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
684     setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
685     AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
686 
687     setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
688     AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
689     setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
690     AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
691     setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
692     AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
693 
694     setOperationAction(ISD::STORE, MVT::v4i16, Promote);
695     AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
696     setOperationAction(ISD::STORE, MVT::v4f16, Promote);
697     AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
698 
699     setOperationAction(ISD::STORE, MVT::v8i16, Promote);
700     AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
701     setOperationAction(ISD::STORE, MVT::v8f16, Promote);
702     AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
703     setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
704     AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
705 
706     setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
707     AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
708     setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
709     AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
710     setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
711     AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
712 
713     setOperationAction(ISD::STORE, MVT::v16i16, Promote);
714     AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
715     setOperationAction(ISD::STORE, MVT::v16f16, Promote);
716     AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
717     setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
718     AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
719 
720     setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
721     AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
722     setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
723     AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
724     setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
725     AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
726 
727     setOperationAction(ISD::STORE, MVT::v32i16, Promote);
728     AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
729     setOperationAction(ISD::STORE, MVT::v32f16, Promote);
730     AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
731     setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
732     AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
733 
734     setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
735                        MVT::v2i32, Expand);
736     setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
737 
738     setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
739                        MVT::v4i32, Expand);
740 
741     setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
742                        MVT::v8i32, Expand);
743 
744     setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
745                        Subtarget->hasVOP3PInsts() ? Legal : Custom);
746 
747     setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
748     // This isn't really legal, but this avoids the legalizer unrolling it (and
749     // allows matching fneg (fabs x) patterns)
750     setOperationAction(ISD::FABS, MVT::v2f16, Legal);
751 
752     setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
753     setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
754 
755     setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
756                        {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
757                        Custom);
758 
759     setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
760                        {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
761                        Expand);
762 
763     for (MVT Vec16 :
764          {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
765           MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
766       setOperationAction(
767           {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
768           Vec16, Custom);
769       setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);
770     }
771   }
772 
773   if (Subtarget->hasVOP3PInsts()) {
774     setOperationAction({ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
775                         ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
776                         ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
777                        MVT::v2i16, Legal);
778 
779     setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
780                         ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
781                        MVT::v2f16, Legal);
782 
783     setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
784                        Custom);
785 
786     setOperationAction(ISD::VECTOR_SHUFFLE,
787                        {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
788                         MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
789                        Custom);
790 
791     for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
792       // Split vector operations.
793       setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
794                           ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN,
795                           ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
796                           ISD::SSUBSAT},
797                          VT, Custom);
798 
799     for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
800       // Split vector operations.
801       setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
802                          VT, Custom);
803 
804     setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
805                        Custom);
806 
807     setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
808     setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
809                        Custom);
810 
811     if (Subtarget->hasPackedFP32Ops()) {
812       setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
813                          MVT::v2f32, Legal);
814       setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA},
815                          {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
816                          Custom);
817     }
818   }
819 
820   setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
821 
822   if (Subtarget->has16BitInsts()) {
823     setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
824     AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
825     setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
826     AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
827   } else {
828     // Legalization hack.
829     setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
830 
831     setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
832   }
833 
834   setOperationAction(ISD::SELECT,
835                      {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
836                       MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
837                       MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
838                       MVT::v32f16, MVT::v32bf16},
839                      Custom);
840 
841   setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
842 
843   if (Subtarget->hasScalarSMulU64())
844     setOperationAction(ISD::MUL, MVT::i64, Custom);
845 
846   if (Subtarget->hasMad64_32())
847     setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
848 
849   if (Subtarget->hasPrefetch())
850     setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
851 
852   if (Subtarget->hasIEEEMinMax()) {
853     setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
854                        {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
855     setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
856                        {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
857                        Custom);
858   }
859 
860   setOperationAction(ISD::INTRINSIC_WO_CHAIN,
861                      {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
862                       MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
863                       MVT::i8},
864                      Custom);
865 
866   setOperationAction(ISD::INTRINSIC_W_CHAIN,
867                      {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
868                       MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
869                       MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
870                       MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
871                      Custom);
872 
873   setOperationAction(ISD::INTRINSIC_VOID,
874                      {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
875                       MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
876                       MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
877                       MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
878                      Custom);
879 
880   setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
881   setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
882   setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
883   setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
884   setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
885 
886   // TODO: Could move this to custom lowering, could benefit from combines on
887   // extract of relevant bits.
888   setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
889 
890   setOperationAction(ISD::MUL, MVT::i1, Promote);
891 
892   setTargetDAGCombine({ISD::ADD,
893                        ISD::UADDO_CARRY,
894                        ISD::SUB,
895                        ISD::USUBO_CARRY,
896                        ISD::FADD,
897                        ISD::FSUB,
898                        ISD::FDIV,
899                        ISD::FMINNUM,
900                        ISD::FMAXNUM,
901                        ISD::FMINNUM_IEEE,
902                        ISD::FMAXNUM_IEEE,
903                        ISD::FMINIMUM,
904                        ISD::FMAXIMUM,
905                        ISD::FMA,
906                        ISD::SMIN,
907                        ISD::SMAX,
908                        ISD::UMIN,
909                        ISD::UMAX,
910                        ISD::SETCC,
911                        ISD::AND,
912                        ISD::OR,
913                        ISD::XOR,
914                        ISD::FSHR,
915                        ISD::SINT_TO_FP,
916                        ISD::UINT_TO_FP,
917                        ISD::FCANONICALIZE,
918                        ISD::SCALAR_TO_VECTOR,
919                        ISD::ZERO_EXTEND,
920                        ISD::SIGN_EXTEND_INREG,
921                        ISD::EXTRACT_VECTOR_ELT,
922                        ISD::INSERT_VECTOR_ELT,
923                        ISD::FCOPYSIGN});
924 
925   if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
926     setTargetDAGCombine(ISD::FP_ROUND);
927 
928   // All memory operations. Some folding on the pointer operand is done to help
929   // matching the constant offsets in the addressing modes.
930   setTargetDAGCombine({ISD::LOAD,
931                        ISD::STORE,
932                        ISD::ATOMIC_LOAD,
933                        ISD::ATOMIC_STORE,
934                        ISD::ATOMIC_CMP_SWAP,
935                        ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
936                        ISD::ATOMIC_SWAP,
937                        ISD::ATOMIC_LOAD_ADD,
938                        ISD::ATOMIC_LOAD_SUB,
939                        ISD::ATOMIC_LOAD_AND,
940                        ISD::ATOMIC_LOAD_OR,
941                        ISD::ATOMIC_LOAD_XOR,
942                        ISD::ATOMIC_LOAD_NAND,
943                        ISD::ATOMIC_LOAD_MIN,
944                        ISD::ATOMIC_LOAD_MAX,
945                        ISD::ATOMIC_LOAD_UMIN,
946                        ISD::ATOMIC_LOAD_UMAX,
947                        ISD::ATOMIC_LOAD_FADD,
948                        ISD::ATOMIC_LOAD_FMIN,
949                        ISD::ATOMIC_LOAD_FMAX,
950                        ISD::ATOMIC_LOAD_UINC_WRAP,
951                        ISD::ATOMIC_LOAD_UDEC_WRAP,
952                        ISD::INTRINSIC_VOID,
953                        ISD::INTRINSIC_W_CHAIN});
954 
955   // FIXME: In other contexts we pretend this is a per-function property.
956   setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
957 
958   setSchedulingPreference(Sched::RegPressure);
959 }
960 
961 const GCNSubtarget *SITargetLowering::getSubtarget() const {
962   return Subtarget;
963 }
964 
965 ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
966   static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
967   return RCRegs;
968 }
969 
970 //===----------------------------------------------------------------------===//
971 // TargetLowering queries
972 //===----------------------------------------------------------------------===//
973 
974 // v_mad_mix* support a conversion from f16 to f32.
975 //
976 // There is only one special case when denormals are enabled we don't currently,
977 // where this is OK to use.
978 bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
979                                        EVT DestVT, EVT SrcVT) const {
980   return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
981           (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
982          DestVT.getScalarType() == MVT::f32 &&
983          SrcVT.getScalarType() == MVT::f16 &&
984          // TODO: This probably only requires no input flushing?
985          denormalModeIsFlushAllF32(DAG.getMachineFunction());
986 }
987 
988 bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
989                                        LLT DestTy, LLT SrcTy) const {
990   return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
991           (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
992          DestTy.getScalarSizeInBits() == 32 &&
993          SrcTy.getScalarSizeInBits() == 16 &&
994          // TODO: This probably only requires no input flushing?
995          denormalModeIsFlushAllF32(*MI.getMF());
996 }
997 
998 bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
999   // SI has some legal vector types, but no legal vector operations. Say no
1000   // shuffles are legal in order to prefer scalarizing some vector operations.
1001   return false;
1002 }
1003 
1004 MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1005                                                     CallingConv::ID CC,
1006                                                     EVT VT) const {
1007   if (CC == CallingConv::AMDGPU_KERNEL)
1008     return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1009 
1010   if (VT.isVector()) {
1011     EVT ScalarVT = VT.getScalarType();
1012     unsigned Size = ScalarVT.getSizeInBits();
1013     if (Size == 16) {
1014       if (Subtarget->has16BitInsts()) {
1015         if (VT.isInteger())
1016           return MVT::v2i16;
1017         return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1018       }
1019       return VT.isInteger() ? MVT::i32 : MVT::f32;
1020     }
1021 
1022     if (Size < 16)
1023       return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1024     return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1025   }
1026 
1027   if (VT.getSizeInBits() > 32)
1028     return MVT::i32;
1029 
1030   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1031 }
1032 
1033 unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1034                                                          CallingConv::ID CC,
1035                                                          EVT VT) const {
1036   if (CC == CallingConv::AMDGPU_KERNEL)
1037     return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1038 
1039   if (VT.isVector()) {
1040     unsigned NumElts = VT.getVectorNumElements();
1041     EVT ScalarVT = VT.getScalarType();
1042     unsigned Size = ScalarVT.getSizeInBits();
1043 
1044     // FIXME: Should probably promote 8-bit vectors to i16.
1045     if (Size == 16 && Subtarget->has16BitInsts())
1046       return (NumElts + 1) / 2;
1047 
1048     if (Size <= 32)
1049       return NumElts;
1050 
1051     if (Size > 32)
1052       return NumElts * ((Size + 31) / 32);
1053   } else if (VT.getSizeInBits() > 32)
1054     return (VT.getSizeInBits() + 31) / 32;
1055 
1056   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1057 }
1058 
1059 unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1060   LLVMContext &Context, CallingConv::ID CC,
1061   EVT VT, EVT &IntermediateVT,
1062   unsigned &NumIntermediates, MVT &RegisterVT) const {
1063   if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1064     unsigned NumElts = VT.getVectorNumElements();
1065     EVT ScalarVT = VT.getScalarType();
1066     unsigned Size = ScalarVT.getSizeInBits();
1067     // FIXME: We should fix the ABI to be the same on targets without 16-bit
1068     // support, but unless we can properly handle 3-vectors, it will be still be
1069     // inconsistent.
1070     if (Size == 16 && Subtarget->has16BitInsts()) {
1071       if (ScalarVT == MVT::bf16) {
1072         RegisterVT = MVT::i32;
1073         IntermediateVT = MVT::v2bf16;
1074       } else {
1075         RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1076         IntermediateVT = RegisterVT;
1077       }
1078       NumIntermediates = (NumElts + 1) / 2;
1079       return NumIntermediates;
1080     }
1081 
1082     if (Size == 32) {
1083       RegisterVT = ScalarVT.getSimpleVT();
1084       IntermediateVT = RegisterVT;
1085       NumIntermediates = NumElts;
1086       return NumIntermediates;
1087     }
1088 
1089     if (Size < 16 && Subtarget->has16BitInsts()) {
1090       // FIXME: Should probably form v2i16 pieces
1091       RegisterVT = MVT::i16;
1092       IntermediateVT = ScalarVT;
1093       NumIntermediates = NumElts;
1094       return NumIntermediates;
1095     }
1096 
1097 
1098     if (Size != 16 && Size <= 32) {
1099       RegisterVT = MVT::i32;
1100       IntermediateVT = ScalarVT;
1101       NumIntermediates = NumElts;
1102       return NumIntermediates;
1103     }
1104 
1105     if (Size > 32) {
1106       RegisterVT = MVT::i32;
1107       IntermediateVT = RegisterVT;
1108       NumIntermediates = NumElts * ((Size + 31) / 32);
1109       return NumIntermediates;
1110     }
1111   }
1112 
1113   return TargetLowering::getVectorTypeBreakdownForCallingConv(
1114     Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1115 }
1116 
1117 static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1118                                  const DataLayout &DL, Type *Ty,
1119                                  unsigned MaxNumLanes) {
1120   assert(MaxNumLanes != 0);
1121 
1122   LLVMContext &Ctx = Ty->getContext();
1123   if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1124     unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1125     return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1126                             NumElts);
1127   }
1128 
1129   return TLI.getValueType(DL, Ty);
1130 }
1131 
1132 // Peek through TFE struct returns to only use the data size.
1133 static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1134                                    const DataLayout &DL, Type *Ty,
1135                                    unsigned MaxNumLanes) {
1136   auto *ST = dyn_cast<StructType>(Ty);
1137   if (!ST)
1138     return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1139 
1140   // TFE intrinsics return an aggregate type.
1141   assert(ST->getNumContainedTypes() == 2 &&
1142          ST->getContainedType(1)->isIntegerTy(32));
1143   return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1144 }
1145 
1146 /// Map address space 7 to MVT::v5i32 because that's its in-memory
1147 /// representation. This return value is vector-typed because there is no
1148 /// MVT::i160 and it is not clear if one can be added. While this could
1149 /// cause issues during codegen, these address space 7 pointers will be
1150 /// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1151 /// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1152 /// modeling, to work.
1153 MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1154   if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1155     return MVT::v5i32;
1156   if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1157       DL.getPointerSizeInBits(AS) == 192)
1158     return MVT::v6i32;
1159   return AMDGPUTargetLowering::getPointerTy(DL, AS);
1160 }
1161 /// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1162 /// v8i32 when padding is added.
1163 /// The in-memory representation of a p9 is {p8, i32, i32}, which is
1164 /// also v8i32 with padding.
1165 MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1166   if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1167        DL.getPointerSizeInBits(AS) == 160) ||
1168       (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1169        DL.getPointerSizeInBits(AS) == 192))
1170     return MVT::v8i32;
1171   return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1172 }
1173 
1174 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
1175                                           const CallInst &CI,
1176                                           MachineFunction &MF,
1177                                           unsigned IntrID) const {
1178   Info.flags = MachineMemOperand::MONone;
1179   if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1180     Info.flags |= MachineMemOperand::MOInvariant;
1181 
1182   if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1183           AMDGPU::lookupRsrcIntrinsic(IntrID)) {
1184     AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
1185                                                   (Intrinsic::ID)IntrID);
1186     MemoryEffects ME = Attr.getMemoryEffects();
1187     if (ME.doesNotAccessMemory())
1188       return false;
1189 
1190     // TODO: Should images get their own address space?
1191     Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1192 
1193     const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1194     if (RsrcIntr->IsImage) {
1195       const AMDGPU::ImageDimIntrinsicInfo *Intr =
1196           AMDGPU::getImageDimIntrinsicInfo(IntrID);
1197       BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1198       Info.align.reset();
1199     }
1200 
1201     Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1202     if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1203       if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1204         // We conservatively set the memory operand of a buffer intrinsic to the
1205         // base resource pointer, so that we can access alias information about
1206         // those pointers. Cases like "this points at the same value
1207         // but with a different offset" are handled in
1208         // areMemAccessesTriviallyDisjoint.
1209         Info.ptrVal = RsrcArg;
1210     }
1211 
1212     auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1213     if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1214       Info.flags |= MachineMemOperand::MOVolatile;
1215     Info.flags |= MachineMemOperand::MODereferenceable;
1216     if (ME.onlyReadsMemory()) {
1217       if (RsrcIntr->IsImage) {
1218         unsigned MaxNumLanes = 4;
1219 
1220         if (!BaseOpcode->Gather4) {
1221           // If this isn't a gather, we may have excess loaded elements in the
1222           // IR type. Check the dmask for the real number of elements loaded.
1223           unsigned DMask
1224             = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1225           MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1226         }
1227 
1228         Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1229                                              CI.getType(), MaxNumLanes);
1230       } else {
1231         Info.memVT =
1232             memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
1233                                     std::numeric_limits<unsigned>::max());
1234       }
1235 
1236       // FIXME: What does alignment mean for an image?
1237       Info.opc = ISD::INTRINSIC_W_CHAIN;
1238       Info.flags |= MachineMemOperand::MOLoad;
1239     } else if (ME.onlyWritesMemory()) {
1240       Info.opc = ISD::INTRINSIC_VOID;
1241 
1242       Type *DataTy = CI.getArgOperand(0)->getType();
1243       if (RsrcIntr->IsImage) {
1244         unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1245         unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1246         Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1247                                            DMaskLanes);
1248       } else
1249         Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1250 
1251       Info.flags |= MachineMemOperand::MOStore;
1252     } else {
1253       // Atomic or NoReturn Sampler
1254       Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1255                                             ISD::INTRINSIC_W_CHAIN;
1256       Info.flags |= MachineMemOperand::MOLoad |
1257                     MachineMemOperand::MOStore |
1258                     MachineMemOperand::MODereferenceable;
1259 
1260       switch (IntrID) {
1261       default:
1262         if (RsrcIntr->IsImage && BaseOpcode->NoReturn) {
1263           // Fake memory access type for no return sampler intrinsics
1264           Info.memVT = MVT::i32;
1265         } else {
1266           // XXX - Should this be volatile without known ordering?
1267           Info.flags |= MachineMemOperand::MOVolatile;
1268           Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1269         }
1270         break;
1271       case Intrinsic::amdgcn_raw_buffer_load_lds:
1272       case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1273       case Intrinsic::amdgcn_struct_buffer_load_lds:
1274       case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1275         unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1276         Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1277         Info.ptrVal = CI.getArgOperand(1);
1278         return true;
1279       }
1280       case Intrinsic::amdgcn_raw_atomic_buffer_load:
1281       case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: {
1282         Info.memVT =
1283             memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
1284                                     std::numeric_limits<unsigned>::max());
1285         Info.flags &= ~MachineMemOperand::MOStore;
1286         return true;
1287       }
1288       }
1289     }
1290     return true;
1291   }
1292 
1293   switch (IntrID) {
1294   case Intrinsic::amdgcn_ds_ordered_add:
1295   case Intrinsic::amdgcn_ds_ordered_swap: {
1296     Info.opc = ISD::INTRINSIC_W_CHAIN;
1297     Info.memVT = MVT::getVT(CI.getType());
1298     Info.ptrVal = CI.getOperand(0);
1299     Info.align.reset();
1300     Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1301 
1302     const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1303     if (!Vol->isZero())
1304       Info.flags |= MachineMemOperand::MOVolatile;
1305 
1306     return true;
1307   }
1308   case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1309   case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1310     Info.opc = ISD::INTRINSIC_W_CHAIN;
1311     Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1312     Info.ptrVal = nullptr;
1313     Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1314     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1315     return true;
1316   }
1317   case Intrinsic::amdgcn_ds_append:
1318   case Intrinsic::amdgcn_ds_consume: {
1319     Info.opc = ISD::INTRINSIC_W_CHAIN;
1320     Info.memVT = MVT::getVT(CI.getType());
1321     Info.ptrVal = CI.getOperand(0);
1322     Info.align.reset();
1323     Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1324 
1325     const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1326     if (!Vol->isZero())
1327       Info.flags |= MachineMemOperand::MOVolatile;
1328 
1329     return true;
1330   }
1331   case Intrinsic::amdgcn_global_atomic_csub: {
1332     Info.opc = ISD::INTRINSIC_W_CHAIN;
1333     Info.memVT = MVT::getVT(CI.getType());
1334     Info.ptrVal = CI.getOperand(0);
1335     Info.align.reset();
1336     Info.flags |= MachineMemOperand::MOLoad |
1337                   MachineMemOperand::MOStore |
1338                   MachineMemOperand::MOVolatile;
1339     return true;
1340   }
1341   case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1342     Info.opc = ISD::INTRINSIC_W_CHAIN;
1343     Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1344 
1345     Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1346     Info.align.reset();
1347     Info.flags |= MachineMemOperand::MOLoad |
1348                   MachineMemOperand::MODereferenceable;
1349     return true;
1350   }
1351   case Intrinsic::amdgcn_global_atomic_fadd:
1352   case Intrinsic::amdgcn_global_atomic_fmin:
1353   case Intrinsic::amdgcn_global_atomic_fmax:
1354   case Intrinsic::amdgcn_global_atomic_fmin_num:
1355   case Intrinsic::amdgcn_global_atomic_fmax_num:
1356   case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1357   case Intrinsic::amdgcn_flat_atomic_fadd:
1358   case Intrinsic::amdgcn_flat_atomic_fmin:
1359   case Intrinsic::amdgcn_flat_atomic_fmax:
1360   case Intrinsic::amdgcn_flat_atomic_fmin_num:
1361   case Intrinsic::amdgcn_flat_atomic_fmax_num:
1362   case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1363   case Intrinsic::amdgcn_atomic_cond_sub_u32:
1364   case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1365     Info.opc = ISD::INTRINSIC_W_CHAIN;
1366     Info.memVT = MVT::getVT(CI.getType());
1367     Info.ptrVal = CI.getOperand(0);
1368     Info.align.reset();
1369     Info.flags |= MachineMemOperand::MOLoad |
1370                   MachineMemOperand::MOStore |
1371                   MachineMemOperand::MODereferenceable |
1372                   MachineMemOperand::MOVolatile;
1373     return true;
1374   }
1375   case Intrinsic::amdgcn_global_load_tr_b64:
1376   case Intrinsic::amdgcn_global_load_tr_b128: {
1377     Info.opc = ISD::INTRINSIC_W_CHAIN;
1378     Info.memVT = MVT::getVT(CI.getType());
1379     Info.ptrVal = CI.getOperand(0);
1380     Info.align.reset();
1381     Info.flags |= MachineMemOperand::MOLoad;
1382     return true;
1383   }
1384   case Intrinsic::amdgcn_ds_gws_init:
1385   case Intrinsic::amdgcn_ds_gws_barrier:
1386   case Intrinsic::amdgcn_ds_gws_sema_v:
1387   case Intrinsic::amdgcn_ds_gws_sema_br:
1388   case Intrinsic::amdgcn_ds_gws_sema_p:
1389   case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1390     Info.opc = ISD::INTRINSIC_VOID;
1391 
1392     const GCNTargetMachine &TM =
1393         static_cast<const GCNTargetMachine &>(getTargetMachine());
1394 
1395     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1396     Info.ptrVal = MFI->getGWSPSV(TM);
1397 
1398     // This is an abstract access, but we need to specify a type and size.
1399     Info.memVT = MVT::i32;
1400     Info.size = 4;
1401     Info.align = Align(4);
1402 
1403     if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1404       Info.flags |= MachineMemOperand::MOLoad;
1405     else
1406       Info.flags |= MachineMemOperand::MOStore;
1407     return true;
1408   }
1409   case Intrinsic::amdgcn_global_load_lds: {
1410     Info.opc = ISD::INTRINSIC_VOID;
1411     unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1412     Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1413     Info.ptrVal = CI.getArgOperand(1);
1414     Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1415     return true;
1416   }
1417   case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1418     Info.opc = ISD::INTRINSIC_W_CHAIN;
1419 
1420     const GCNTargetMachine &TM =
1421         static_cast<const GCNTargetMachine &>(getTargetMachine());
1422 
1423     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1424     Info.ptrVal = MFI->getGWSPSV(TM);
1425 
1426     // This is an abstract access, but we need to specify a type and size.
1427     Info.memVT = MVT::i32;
1428     Info.size = 4;
1429     Info.align = Align(4);
1430 
1431     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1432     return true;
1433   }
1434   default:
1435     return false;
1436   }
1437 }
1438 
1439 void SITargetLowering::CollectTargetIntrinsicOperands(
1440     const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1441   switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1442   case Intrinsic::amdgcn_addrspacecast_nonnull: {
1443     // The DAG's ValueType loses the addrspaces.
1444     // Add them as 2 extra Constant operands "from" and "to".
1445     unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1446     unsigned DstAS = I.getType()->getPointerAddressSpace();
1447     Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1448     Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1449     break;
1450   }
1451   default:
1452     break;
1453   }
1454 }
1455 
1456 bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
1457                                             SmallVectorImpl<Value*> &Ops,
1458                                             Type *&AccessTy) const {
1459   Value *Ptr = nullptr;
1460   switch (II->getIntrinsicID()) {
1461   case Intrinsic::amdgcn_atomic_cond_sub_u32:
1462   case Intrinsic::amdgcn_ds_append:
1463   case Intrinsic::amdgcn_ds_consume:
1464   case Intrinsic::amdgcn_ds_ordered_add:
1465   case Intrinsic::amdgcn_ds_ordered_swap:
1466   case Intrinsic::amdgcn_flat_atomic_fadd:
1467   case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1468   case Intrinsic::amdgcn_flat_atomic_fmax:
1469   case Intrinsic::amdgcn_flat_atomic_fmax_num:
1470   case Intrinsic::amdgcn_flat_atomic_fmin:
1471   case Intrinsic::amdgcn_flat_atomic_fmin_num:
1472   case Intrinsic::amdgcn_global_atomic_csub:
1473   case Intrinsic::amdgcn_global_atomic_fadd:
1474   case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1475   case Intrinsic::amdgcn_global_atomic_fmax:
1476   case Intrinsic::amdgcn_global_atomic_fmax_num:
1477   case Intrinsic::amdgcn_global_atomic_fmin:
1478   case Intrinsic::amdgcn_global_atomic_fmin_num:
1479   case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1480   case Intrinsic::amdgcn_global_load_tr_b64:
1481   case Intrinsic::amdgcn_global_load_tr_b128:
1482     Ptr = II->getArgOperand(0);
1483     break;
1484   case Intrinsic::amdgcn_global_load_lds:
1485     Ptr = II->getArgOperand(1);
1486     break;
1487   default:
1488     return false;
1489   }
1490   AccessTy = II->getType();
1491   Ops.push_back(Ptr);
1492   return true;
1493 }
1494 
1495 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1496                                                  unsigned AddrSpace) const {
1497   if (!Subtarget->hasFlatInstOffsets()) {
1498     // Flat instructions do not have offsets, and only have the register
1499     // address.
1500     return AM.BaseOffs == 0 && AM.Scale == 0;
1501   }
1502 
1503   decltype(SIInstrFlags::FLAT) FlatVariant =
1504       AddrSpace == AMDGPUAS::GLOBAL_ADDRESS    ? SIInstrFlags::FlatGlobal
1505       : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch
1506                                                : SIInstrFlags::FLAT;
1507 
1508   return AM.Scale == 0 &&
1509          (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1510                                   AM.BaseOffs, AddrSpace, FlatVariant));
1511 }
1512 
1513 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1514   if (Subtarget->hasFlatGlobalInsts())
1515     return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS);
1516 
1517   if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1518     // Assume the we will use FLAT for all global memory accesses
1519     // on VI.
1520     // FIXME: This assumption is currently wrong.  On VI we still use
1521     // MUBUF instructions for the r + i addressing mode.  As currently
1522     // implemented, the MUBUF instructions only work on buffer < 4GB.
1523     // It may be possible to support > 4GB buffers with MUBUF instructions,
1524     // by setting the stride value in the resource descriptor which would
1525     // increase the size limit to (stride * 4GB).  However, this is risky,
1526     // because it has never been validated.
1527     return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
1528   }
1529 
1530   return isLegalMUBUFAddressingMode(AM);
1531 }
1532 
1533 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1534   // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1535   // additionally can do r + r + i with addr64. 32-bit has more addressing
1536   // mode options. Depending on the resource constant, it can also do
1537   // (i64 r0) + (i32 r1) * (i14 i).
1538   //
1539   // Private arrays end up using a scratch buffer most of the time, so also
1540   // assume those use MUBUF instructions. Scratch loads / stores are currently
1541   // implemented as mubuf instructions with offen bit set, so slightly
1542   // different than the normal addr64.
1543   const SIInstrInfo *TII = Subtarget->getInstrInfo();
1544   if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1545     return false;
1546 
1547   // FIXME: Since we can split immediate into soffset and immediate offset,
1548   // would it make sense to allow any immediate?
1549 
1550   switch (AM.Scale) {
1551   case 0: // r + i or just i, depending on HasBaseReg.
1552     return true;
1553   case 1:
1554     return true; // We have r + r or r + i.
1555   case 2:
1556     if (AM.HasBaseReg) {
1557       // Reject 2 * r + r.
1558       return false;
1559     }
1560 
1561     // Allow 2 * r as r + r
1562     // Or  2 * r + i is allowed as r + r + i.
1563     return true;
1564   default: // Don't allow n * r
1565     return false;
1566   }
1567 }
1568 
1569 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1570                                              const AddrMode &AM, Type *Ty,
1571                                              unsigned AS, Instruction *I) const {
1572   // No global is ever allowed as a base.
1573   if (AM.BaseGV)
1574     return false;
1575 
1576   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1577     return isLegalGlobalAddressingMode(AM);
1578 
1579   if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1580       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1581       AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
1582       AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
1583     // If the offset isn't a multiple of 4, it probably isn't going to be
1584     // correctly aligned.
1585     // FIXME: Can we get the real alignment here?
1586     if (AM.BaseOffs % 4 != 0)
1587       return isLegalMUBUFAddressingMode(AM);
1588 
1589     if (!Subtarget->hasScalarSubwordLoads()) {
1590       // There are no SMRD extloads, so if we have to do a small type access we
1591       // will use a MUBUF load.
1592       // FIXME?: We also need to do this if unaligned, but we don't know the
1593       // alignment here.
1594       if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1595         return isLegalGlobalAddressingMode(AM);
1596     }
1597 
1598     if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1599       // SMRD instructions have an 8-bit, dword offset on SI.
1600       if (!isUInt<8>(AM.BaseOffs / 4))
1601         return false;
1602     } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1603       // On CI+, this can also be a 32-bit literal constant offset. If it fits
1604       // in 8-bits, it can use a smaller encoding.
1605       if (!isUInt<32>(AM.BaseOffs / 4))
1606         return false;
1607     } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1608       // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1609       if (!isUInt<20>(AM.BaseOffs))
1610         return false;
1611     } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1612       // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1613       // for S_BUFFER_* instructions).
1614       if (!isInt<21>(AM.BaseOffs))
1615         return false;
1616     } else {
1617       // On GFX12, all offsets are signed 24-bit in bytes.
1618       if (!isInt<24>(AM.BaseOffs))
1619         return false;
1620     }
1621 
1622     if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1623          AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1624         AM.BaseOffs < 0) {
1625       // Scalar (non-buffer) loads can only use a negative offset if
1626       // soffset+offset is non-negative. Since the compiler can only prove that
1627       // in a few special cases, it is safer to claim that negative offsets are
1628       // not supported.
1629       return false;
1630     }
1631 
1632     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1633       return true;
1634 
1635     if (AM.Scale == 1 && AM.HasBaseReg)
1636       return true;
1637 
1638     return false;
1639   }
1640 
1641   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1642     return Subtarget->enableFlatScratch()
1643                ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS)
1644                : isLegalMUBUFAddressingMode(AM);
1645 
1646   if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1647       (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1648     // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1649     // field.
1650     // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1651     // an 8-bit dword offset but we don't know the alignment here.
1652     if (!isUInt<16>(AM.BaseOffs))
1653       return false;
1654 
1655     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1656       return true;
1657 
1658     if (AM.Scale == 1 && AM.HasBaseReg)
1659       return true;
1660 
1661     return false;
1662   }
1663 
1664   if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1665     // For an unknown address space, this usually means that this is for some
1666     // reason being used for pure arithmetic, and not based on some addressing
1667     // computation. We don't have instructions that compute pointers with any
1668     // addressing modes, so treat them as having no offset like flat
1669     // instructions.
1670     return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
1671   }
1672 
1673   // Assume a user alias of global for unknown address spaces.
1674   return isLegalGlobalAddressingMode(AM);
1675 }
1676 
1677 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1678                                         const MachineFunction &MF) const {
1679   if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1680     return (MemVT.getSizeInBits() <= 4 * 32);
1681   if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1682     unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1683     return (MemVT.getSizeInBits() <= MaxPrivateBits);
1684   }
1685   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
1686     return (MemVT.getSizeInBits() <= 2 * 32);
1687   return true;
1688 }
1689 
1690 bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1691     unsigned Size, unsigned AddrSpace, Align Alignment,
1692     MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1693   if (IsFast)
1694     *IsFast = 0;
1695 
1696   if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1697       AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1698     // Check if alignment requirements for ds_read/write instructions are
1699     // disabled.
1700     if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1701       return false;
1702 
1703     Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1704     if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1705         Alignment < RequiredAlignment)
1706       return false;
1707 
1708     // Either, the alignment requirements are "enabled", or there is an
1709     // unaligned LDS access related hardware bug though alignment requirements
1710     // are "disabled". In either case, we need to check for proper alignment
1711     // requirements.
1712     //
1713     switch (Size) {
1714     case 64:
1715       // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1716       // address is negative, then the instruction is incorrectly treated as
1717       // out-of-bounds even if base + offsets is in bounds. Split vectorized
1718       // loads here to avoid emitting ds_read2_b32. We may re-combine the
1719       // load later in the SILoadStoreOptimizer.
1720       if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1721         return false;
1722 
1723       // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
1724       // can do a 4 byte aligned, 8 byte access in a single operation using
1725       // ds_read2/write2_b32 with adjacent offsets.
1726       RequiredAlignment = Align(4);
1727 
1728       if (Subtarget->hasUnalignedDSAccessEnabled()) {
1729         // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1730         // ds_write2_b32 depending on the alignment. In either case with either
1731         // alignment there is no faster way of doing this.
1732 
1733         // The numbers returned here and below are not additive, it is a 'speed
1734         // rank'. They are just meant to be compared to decide if a certain way
1735         // of lowering an operation is faster than another. For that purpose
1736         // naturally aligned operation gets it bitsize to indicate that "it
1737         // operates with a speed comparable to N-bit wide load". With the full
1738         // alignment ds128 is slower than ds96 for example. If underaligned it
1739         // is comparable to a speed of a single dword access, which would then
1740         // mean 32 < 128 and it is faster to issue a wide load regardless.
1741         // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
1742         // wider load which will not be aligned anymore the latter is slower.
1743         if (IsFast)
1744           *IsFast = (Alignment >= RequiredAlignment) ? 64
1745                     : (Alignment < Align(4))         ? 32
1746                                                      : 1;
1747         return true;
1748       }
1749 
1750       break;
1751     case 96:
1752       if (!Subtarget->hasDS96AndDS128())
1753         return false;
1754 
1755       // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
1756       // gfx8 and older.
1757 
1758       if (Subtarget->hasUnalignedDSAccessEnabled()) {
1759         // Naturally aligned access is fastest. However, also report it is Fast
1760         // if memory is aligned less than DWORD. A narrow load or store will be
1761         // be equally slow as a single ds_read_b96/ds_write_b96, but there will
1762         // be more of them, so overall we will pay less penalty issuing a single
1763         // instruction.
1764 
1765         // See comment on the values above.
1766         if (IsFast)
1767           *IsFast = (Alignment >= RequiredAlignment) ? 96
1768                     : (Alignment < Align(4))         ? 32
1769                                                      : 1;
1770         return true;
1771       }
1772 
1773       break;
1774     case 128:
1775       if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1776         return false;
1777 
1778       // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
1779       // gfx8 and older, but  we can do a 8 byte aligned, 16 byte access in a
1780       // single operation using ds_read2/write2_b64.
1781       RequiredAlignment = Align(8);
1782 
1783       if (Subtarget->hasUnalignedDSAccessEnabled()) {
1784         // Naturally aligned access is fastest. However, also report it is Fast
1785         // if memory is aligned less than DWORD. A narrow load or store will be
1786         // be equally slow as a single ds_read_b128/ds_write_b128, but there
1787         // will be more of them, so overall we will pay less penalty issuing a
1788         // single instruction.
1789 
1790         // See comment on the values above.
1791         if (IsFast)
1792           *IsFast = (Alignment >= RequiredAlignment) ? 128
1793                     : (Alignment < Align(4))         ? 32
1794                                                      : 1;
1795         return true;
1796       }
1797 
1798       break;
1799     default:
1800       if (Size > 32)
1801         return false;
1802 
1803       break;
1804     }
1805 
1806     // See comment on the values above.
1807     // Note that we have a single-dword or sub-dword here, so if underaligned
1808     // it is a slowest possible access, hence returned value is 0.
1809     if (IsFast)
1810       *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1811 
1812     return Alignment >= RequiredAlignment ||
1813            Subtarget->hasUnalignedDSAccessEnabled();
1814   }
1815 
1816   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1817     bool AlignedBy4 = Alignment >= Align(4);
1818     if (IsFast)
1819       *IsFast = AlignedBy4;
1820 
1821     return AlignedBy4 ||
1822            Subtarget->enableFlatScratch() ||
1823            Subtarget->hasUnalignedScratchAccess();
1824   }
1825 
1826   // FIXME: We have to be conservative here and assume that flat operations
1827   // will access scratch.  If we had access to the IR function, then we
1828   // could determine if any private memory was used in the function.
1829   if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1830       !Subtarget->hasUnalignedScratchAccess()) {
1831     bool AlignedBy4 = Alignment >= Align(4);
1832     if (IsFast)
1833       *IsFast = AlignedBy4;
1834 
1835     return AlignedBy4;
1836   }
1837 
1838   // So long as they are correct, wide global memory operations perform better
1839   // than multiple smaller memory ops -- even when misaligned
1840   if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1841     if (IsFast)
1842       *IsFast = Size;
1843 
1844     return Alignment >= Align(4) ||
1845            Subtarget->hasUnalignedBufferAccessEnabled();
1846   }
1847 
1848   // Smaller than dword value must be aligned.
1849   if (Size < 32)
1850     return false;
1851 
1852   // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1853   // byte-address are ignored, thus forcing Dword alignment.
1854   // This applies to private, global, and constant memory.
1855   if (IsFast)
1856     *IsFast = 1;
1857 
1858   return Size >= 32 && Alignment >= Align(4);
1859 }
1860 
1861 bool SITargetLowering::allowsMisalignedMemoryAccesses(
1862     EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1863     unsigned *IsFast) const {
1864   return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
1865                                             Alignment, Flags, IsFast);
1866 }
1867 
1868 EVT SITargetLowering::getOptimalMemOpType(
1869     const MemOp &Op, const AttributeList &FuncAttributes) const {
1870   // FIXME: Should account for address space here.
1871 
1872   // The default fallback uses the private pointer size as a guess for a type to
1873   // use. Make sure we switch these to 64-bit accesses.
1874 
1875   if (Op.size() >= 16 &&
1876       Op.isDstAligned(Align(4))) // XXX: Should only do for global
1877     return MVT::v4i32;
1878 
1879   if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1880     return MVT::v2i32;
1881 
1882   // Use the default.
1883   return MVT::Other;
1884 }
1885 
1886 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1887   const MemSDNode *MemNode = cast<MemSDNode>(N);
1888   return MemNode->getMemOperand()->getFlags() & MONoClobber;
1889 }
1890 
1891 bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
1892   return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1893          AS == AMDGPUAS::PRIVATE_ADDRESS;
1894 }
1895 
1896 bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
1897                                            unsigned DestAS) const {
1898   // Flat -> private/local is a simple truncate.
1899   // Flat -> global is no-op
1900   if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1901     return true;
1902 
1903   const GCNTargetMachine &TM =
1904       static_cast<const GCNTargetMachine &>(getTargetMachine());
1905   return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1906 }
1907 
1908 bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
1909   const MemSDNode *MemNode = cast<MemSDNode>(N);
1910 
1911   return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1912 }
1913 
1914 TargetLoweringBase::LegalizeTypeAction
1915 SITargetLowering::getPreferredVectorAction(MVT VT) const {
1916   if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1917       VT.getScalarType().bitsLE(MVT::i16))
1918     return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
1919   return TargetLoweringBase::getPreferredVectorAction(VT);
1920 }
1921 
1922 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1923                                                          Type *Ty) const {
1924   // FIXME: Could be smarter if called for vector constants.
1925   return true;
1926 }
1927 
1928 bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
1929                                                unsigned Index) const {
1930   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
1931     return false;
1932 
1933   // TODO: Add more cases that are cheap.
1934   return Index == 0;
1935 }
1936 
1937 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
1938   if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1939     switch (Op) {
1940     case ISD::LOAD:
1941     case ISD::STORE:
1942 
1943     // These operations are done with 32-bit instructions anyway.
1944     case ISD::AND:
1945     case ISD::OR:
1946     case ISD::XOR:
1947     case ISD::SELECT:
1948       // TODO: Extensions?
1949       return true;
1950     default:
1951       return false;
1952     }
1953   }
1954 
1955   // SimplifySetCC uses this function to determine whether or not it should
1956   // create setcc with i1 operands.  We don't have instructions for i1 setcc.
1957   if (VT == MVT::i1 && Op == ISD::SETCC)
1958     return false;
1959 
1960   return TargetLowering::isTypeDesirableForOp(Op, VT);
1961 }
1962 
1963 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1964                                                    const SDLoc &SL,
1965                                                    SDValue Chain,
1966                                                    uint64_t Offset) const {
1967   const DataLayout &DL = DAG.getDataLayout();
1968   MachineFunction &MF = DAG.getMachineFunction();
1969   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1970 
1971   const ArgDescriptor *InputPtrReg;
1972   const TargetRegisterClass *RC;
1973   LLT ArgTy;
1974   MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
1975 
1976   std::tie(InputPtrReg, RC, ArgTy) =
1977       Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1978 
1979   // We may not have the kernarg segment argument if we have no kernel
1980   // arguments.
1981   if (!InputPtrReg)
1982     return DAG.getConstant(Offset, SL, PtrVT);
1983 
1984   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1985   SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1986     MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1987 
1988   return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1989 }
1990 
1991 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1992                                             const SDLoc &SL) const {
1993   uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
1994                                                FIRST_IMPLICIT);
1995   return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1996 }
1997 
1998 SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1999                                          const SDLoc &SL) const {
2000 
2001   Function &F = DAG.getMachineFunction().getFunction();
2002   std::optional<uint32_t> KnownSize =
2003       AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
2004   if (KnownSize.has_value())
2005     return DAG.getConstant(*KnownSize, SL, MVT::i32);
2006   return SDValue();
2007 }
2008 
2009 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2010                                          const SDLoc &SL, SDValue Val,
2011                                          bool Signed,
2012                                          const ISD::InputArg *Arg) const {
2013   // First, if it is a widened vector, narrow it.
2014   if (VT.isVector() &&
2015       VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
2016     EVT NarrowedVT =
2017         EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
2018                          VT.getVectorNumElements());
2019     Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2020                       DAG.getConstant(0, SL, MVT::i32));
2021   }
2022 
2023   // Then convert the vector elements or scalar value.
2024   if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
2025       VT.bitsLT(MemVT)) {
2026     unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2027     Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2028   }
2029 
2030   if (MemVT.isFloatingPoint())
2031     Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2032   else if (Signed)
2033     Val = DAG.getSExtOrTrunc(Val, SL, VT);
2034   else
2035     Val = DAG.getZExtOrTrunc(Val, SL, VT);
2036 
2037   return Val;
2038 }
2039 
2040 SDValue SITargetLowering::lowerKernargMemParameter(
2041     SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2042     uint64_t Offset, Align Alignment, bool Signed,
2043     const ISD::InputArg *Arg) const {
2044   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2045 
2046   // Try to avoid using an extload by loading earlier than the argument address,
2047   // and extracting the relevant bits. The load should hopefully be merged with
2048   // the previous argument.
2049   if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2050     // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2051     int64_t AlignDownOffset = alignDown(Offset, 4);
2052     int64_t OffsetDiff = Offset - AlignDownOffset;
2053 
2054     EVT IntVT = MemVT.changeTypeToInteger();
2055 
2056     // TODO: If we passed in the base kernel offset we could have a better
2057     // alignment than 4, but we don't really need it.
2058     SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2059     SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2060                                MachineMemOperand::MODereferenceable |
2061                                    MachineMemOperand::MOInvariant);
2062 
2063     SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2064     SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2065 
2066     SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2067     ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2068     ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2069 
2070 
2071     return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
2072   }
2073 
2074   SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2075   SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2076                              MachineMemOperand::MODereferenceable |
2077                                  MachineMemOperand::MOInvariant);
2078 
2079   SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2080   return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
2081 }
2082 
2083 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
2084                                               const SDLoc &SL, SDValue Chain,
2085                                               const ISD::InputArg &Arg) const {
2086   MachineFunction &MF = DAG.getMachineFunction();
2087   MachineFrameInfo &MFI = MF.getFrameInfo();
2088 
2089   if (Arg.Flags.isByVal()) {
2090     unsigned Size = Arg.Flags.getByValSize();
2091     int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2092     return DAG.getFrameIndex(FrameIdx, MVT::i32);
2093   }
2094 
2095   unsigned ArgOffset = VA.getLocMemOffset();
2096   unsigned ArgSize = VA.getValVT().getStoreSize();
2097 
2098   int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2099 
2100   // Create load nodes to retrieve arguments from the stack.
2101   SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2102   SDValue ArgValue;
2103 
2104   // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
2105   ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2106   MVT MemVT = VA.getValVT();
2107 
2108   switch (VA.getLocInfo()) {
2109   default:
2110     break;
2111   case CCValAssign::BCvt:
2112     MemVT = VA.getLocVT();
2113     break;
2114   case CCValAssign::SExt:
2115     ExtType = ISD::SEXTLOAD;
2116     break;
2117   case CCValAssign::ZExt:
2118     ExtType = ISD::ZEXTLOAD;
2119     break;
2120   case CCValAssign::AExt:
2121     ExtType = ISD::EXTLOAD;
2122     break;
2123   }
2124 
2125   ArgValue = DAG.getExtLoad(
2126     ExtType, SL, VA.getLocVT(), Chain, FIN,
2127     MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
2128     MemVT);
2129   return ArgValue;
2130 }
2131 
2132 SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
2133   const SIMachineFunctionInfo &MFI,
2134   EVT VT,
2135   AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
2136   const ArgDescriptor *Reg = nullptr;
2137   const TargetRegisterClass *RC;
2138   LLT Ty;
2139 
2140   CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
2141   const ArgDescriptor WorkGroupIDX =
2142       ArgDescriptor::createRegister(AMDGPU::TTMP9);
2143   // If GridZ is not programmed in an entry function then the hardware will set
2144   // it to all zeros, so there is no need to mask the GridY value in the low
2145   // order bits.
2146   const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2147       AMDGPU::TTMP7,
2148       AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2149   const ArgDescriptor WorkGroupIDZ =
2150       ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2151   if (Subtarget->hasArchitectedSGPRs() &&
2152       (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
2153     switch (PVID) {
2154     case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
2155       Reg = &WorkGroupIDX;
2156       RC = &AMDGPU::SReg_32RegClass;
2157       Ty = LLT::scalar(32);
2158       break;
2159     case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
2160       Reg = &WorkGroupIDY;
2161       RC = &AMDGPU::SReg_32RegClass;
2162       Ty = LLT::scalar(32);
2163       break;
2164     case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
2165       Reg = &WorkGroupIDZ;
2166       RC = &AMDGPU::SReg_32RegClass;
2167       Ty = LLT::scalar(32);
2168       break;
2169     default:
2170       break;
2171     }
2172   }
2173 
2174   if (!Reg)
2175     std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2176   if (!Reg) {
2177     if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
2178       // It's possible for a kernarg intrinsic call to appear in a kernel with
2179       // no allocated segment, in which case we do not add the user sgpr
2180       // argument, so just return null.
2181       return DAG.getConstant(0, SDLoc(), VT);
2182     }
2183 
2184     // It's undefined behavior if a function marked with the amdgpu-no-*
2185     // attributes uses the corresponding intrinsic.
2186     return DAG.getUNDEF(VT);
2187   }
2188 
2189   return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2190 }
2191 
2192 static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
2193                                CallingConv::ID CallConv,
2194                                ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2195                                FunctionType *FType,
2196                                SIMachineFunctionInfo *Info) {
2197   for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2198     const ISD::InputArg *Arg = &Ins[I];
2199 
2200     assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2201            "vector type argument should have been split");
2202 
2203     // First check if it's a PS input addr.
2204     if (CallConv == CallingConv::AMDGPU_PS &&
2205         !Arg->Flags.isInReg() && PSInputNum <= 15) {
2206       bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2207 
2208       // Inconveniently only the first part of the split is marked as isSplit,
2209       // so skip to the end. We only want to increment PSInputNum once for the
2210       // entire split argument.
2211       if (Arg->Flags.isSplit()) {
2212         while (!Arg->Flags.isSplitEnd()) {
2213           assert((!Arg->VT.isVector() ||
2214                   Arg->VT.getScalarSizeInBits() == 16) &&
2215                  "unexpected vector split in ps argument type");
2216           if (!SkipArg)
2217             Splits.push_back(*Arg);
2218           Arg = &Ins[++I];
2219         }
2220       }
2221 
2222       if (SkipArg) {
2223         // We can safely skip PS inputs.
2224         Skipped.set(Arg->getOrigArgIndex());
2225         ++PSInputNum;
2226         continue;
2227       }
2228 
2229       Info->markPSInputAllocated(PSInputNum);
2230       if (Arg->Used)
2231         Info->markPSInputEnabled(PSInputNum);
2232 
2233       ++PSInputNum;
2234     }
2235 
2236     Splits.push_back(*Arg);
2237   }
2238 }
2239 
2240 // Allocate special inputs passed in VGPRs.
2241 void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
2242                                                       MachineFunction &MF,
2243                                                       const SIRegisterInfo &TRI,
2244                                                       SIMachineFunctionInfo &Info) const {
2245   const LLT S32 = LLT::scalar(32);
2246   MachineRegisterInfo &MRI = MF.getRegInfo();
2247 
2248   if (Info.hasWorkItemIDX()) {
2249     Register Reg = AMDGPU::VGPR0;
2250     MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2251 
2252     CCInfo.AllocateReg(Reg);
2253     unsigned Mask = (Subtarget->hasPackedTID() &&
2254                      Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2255     Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2256   }
2257 
2258   if (Info.hasWorkItemIDY()) {
2259     assert(Info.hasWorkItemIDX());
2260     if (Subtarget->hasPackedTID()) {
2261       Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2262                                                         0x3ff << 10));
2263     } else {
2264       unsigned Reg = AMDGPU::VGPR1;
2265       MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2266 
2267       CCInfo.AllocateReg(Reg);
2268       Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2269     }
2270   }
2271 
2272   if (Info.hasWorkItemIDZ()) {
2273     assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2274     if (Subtarget->hasPackedTID()) {
2275       Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2276                                                         0x3ff << 20));
2277     } else {
2278       unsigned Reg = AMDGPU::VGPR2;
2279       MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2280 
2281       CCInfo.AllocateReg(Reg);
2282       Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2283     }
2284   }
2285 }
2286 
2287 // Try to allocate a VGPR at the end of the argument list, or if no argument
2288 // VGPRs are left allocating a stack slot.
2289 // If \p Mask is is given it indicates bitfield position in the register.
2290 // If \p Arg is given use it with new ]p Mask instead of allocating new.
2291 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2292                                          ArgDescriptor Arg = ArgDescriptor()) {
2293   if (Arg.isSet())
2294     return ArgDescriptor::createArg(Arg, Mask);
2295 
2296   ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2297   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2298   if (RegIdx == ArgVGPRs.size()) {
2299     // Spill to stack required.
2300     int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2301 
2302     return ArgDescriptor::createStack(Offset, Mask);
2303   }
2304 
2305   unsigned Reg = ArgVGPRs[RegIdx];
2306   Reg = CCInfo.AllocateReg(Reg);
2307   assert(Reg != AMDGPU::NoRegister);
2308 
2309   MachineFunction &MF = CCInfo.getMachineFunction();
2310   Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2311   MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2312   return ArgDescriptor::createRegister(Reg, Mask);
2313 }
2314 
2315 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2316                                              const TargetRegisterClass *RC,
2317                                              unsigned NumArgRegs) {
2318   ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2319   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2320   if (RegIdx == ArgSGPRs.size())
2321     report_fatal_error("ran out of SGPRs for arguments");
2322 
2323   unsigned Reg = ArgSGPRs[RegIdx];
2324   Reg = CCInfo.AllocateReg(Reg);
2325   assert(Reg != AMDGPU::NoRegister);
2326 
2327   MachineFunction &MF = CCInfo.getMachineFunction();
2328   MF.addLiveIn(Reg, RC);
2329   return ArgDescriptor::createRegister(Reg);
2330 }
2331 
2332 // If this has a fixed position, we still should allocate the register in the
2333 // CCInfo state. Technically we could get away with this for values passed
2334 // outside of the normal argument range.
2335 static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2336                                        const TargetRegisterClass *RC,
2337                                        MCRegister Reg) {
2338   Reg = CCInfo.AllocateReg(Reg);
2339   assert(Reg != AMDGPU::NoRegister);
2340   MachineFunction &MF = CCInfo.getMachineFunction();
2341   MF.addLiveIn(Reg, RC);
2342 }
2343 
2344 static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2345   if (Arg) {
2346     allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2347                                Arg.getRegister());
2348   } else
2349     Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2350 }
2351 
2352 static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2353   if (Arg) {
2354     allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2355                                Arg.getRegister());
2356   } else
2357     Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2358 }
2359 
2360 /// Allocate implicit function VGPR arguments at the end of allocated user
2361 /// arguments.
2362 void SITargetLowering::allocateSpecialInputVGPRs(
2363   CCState &CCInfo, MachineFunction &MF,
2364   const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2365   const unsigned Mask = 0x3ff;
2366   ArgDescriptor Arg;
2367 
2368   if (Info.hasWorkItemIDX()) {
2369     Arg = allocateVGPR32Input(CCInfo, Mask);
2370     Info.setWorkItemIDX(Arg);
2371   }
2372 
2373   if (Info.hasWorkItemIDY()) {
2374     Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2375     Info.setWorkItemIDY(Arg);
2376   }
2377 
2378   if (Info.hasWorkItemIDZ())
2379     Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2380 }
2381 
2382 /// Allocate implicit function VGPR arguments in fixed registers.
2383 void SITargetLowering::allocateSpecialInputVGPRsFixed(
2384   CCState &CCInfo, MachineFunction &MF,
2385   const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2386   Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2387   if (!Reg)
2388     report_fatal_error("failed to allocated VGPR for implicit arguments");
2389 
2390   const unsigned Mask = 0x3ff;
2391   Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2392   Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2393   Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2394 }
2395 
2396 void SITargetLowering::allocateSpecialInputSGPRs(
2397   CCState &CCInfo,
2398   MachineFunction &MF,
2399   const SIRegisterInfo &TRI,
2400   SIMachineFunctionInfo &Info) const {
2401   auto &ArgInfo = Info.getArgInfo();
2402   const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2403 
2404   // TODO: Unify handling with private memory pointers.
2405   if (UserSGPRInfo.hasDispatchPtr())
2406     allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2407 
2408   const Module *M = MF.getFunction().getParent();
2409   if (UserSGPRInfo.hasQueuePtr() &&
2410       AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5)
2411     allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2412 
2413   // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2414   // constant offset from the kernarg segment.
2415   if (Info.hasImplicitArgPtr())
2416     allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2417 
2418   if (UserSGPRInfo.hasDispatchID())
2419     allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2420 
2421   // flat_scratch_init is not applicable for non-kernel functions.
2422 
2423   if (Info.hasWorkGroupIDX())
2424     allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2425 
2426   if (Info.hasWorkGroupIDY())
2427     allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2428 
2429   if (Info.hasWorkGroupIDZ())
2430     allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2431 
2432   if (Info.hasLDSKernelId())
2433     allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2434 }
2435 
2436 // Allocate special inputs passed in user SGPRs.
2437 void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2438                                             MachineFunction &MF,
2439                                             const SIRegisterInfo &TRI,
2440                                             SIMachineFunctionInfo &Info) const {
2441   const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2442   if (UserSGPRInfo.hasImplicitBufferPtr()) {
2443     Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2444     MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2445     CCInfo.AllocateReg(ImplicitBufferPtrReg);
2446   }
2447 
2448   // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2449   if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2450     Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2451     MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2452     CCInfo.AllocateReg(PrivateSegmentBufferReg);
2453   }
2454 
2455   if (UserSGPRInfo.hasDispatchPtr()) {
2456     Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2457     MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2458     CCInfo.AllocateReg(DispatchPtrReg);
2459   }
2460 
2461   const Module *M = MF.getFunction().getParent();
2462   if (UserSGPRInfo.hasQueuePtr() &&
2463       AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) {
2464     Register QueuePtrReg = Info.addQueuePtr(TRI);
2465     MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2466     CCInfo.AllocateReg(QueuePtrReg);
2467   }
2468 
2469   if (UserSGPRInfo.hasKernargSegmentPtr()) {
2470     MachineRegisterInfo &MRI = MF.getRegInfo();
2471     Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2472     CCInfo.AllocateReg(InputPtrReg);
2473 
2474     Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2475     MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2476   }
2477 
2478   if (UserSGPRInfo.hasDispatchID()) {
2479     Register DispatchIDReg = Info.addDispatchID(TRI);
2480     MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2481     CCInfo.AllocateReg(DispatchIDReg);
2482   }
2483 
2484   if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2485     Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2486     MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2487     CCInfo.AllocateReg(FlatScratchInitReg);
2488   }
2489 
2490   if (UserSGPRInfo.hasPrivateSegmentSize()) {
2491     Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2492     MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2493     CCInfo.AllocateReg(PrivateSegmentSizeReg);
2494   }
2495 
2496   // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2497   // these from the dispatch pointer.
2498 }
2499 
2500 // Allocate pre-loaded kernel arguemtns. Arguments to be preloading must be
2501 // sequential starting from the first argument.
2502 void SITargetLowering::allocatePreloadKernArgSGPRs(
2503     CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2504     const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
2505     const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2506   Function &F = MF.getFunction();
2507   unsigned LastExplicitArgOffset =
2508       MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2509   GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2510   bool InPreloadSequence = true;
2511   unsigned InIdx = 0;
2512   for (auto &Arg : F.args()) {
2513     if (!InPreloadSequence || !Arg.hasInRegAttr())
2514       break;
2515 
2516     int ArgIdx = Arg.getArgNo();
2517     // Don't preload non-original args or parts not in the current preload
2518     // sequence.
2519     if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2520                                (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2521       break;
2522 
2523     for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2524            (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2525          InIdx++) {
2526       assert(ArgLocs[ArgIdx].isMemLoc());
2527       auto &ArgLoc = ArgLocs[InIdx];
2528       const Align KernelArgBaseAlign = Align(16);
2529       unsigned ArgOffset = ArgLoc.getLocMemOffset();
2530       Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2531       unsigned NumAllocSGPRs =
2532           alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2533 
2534       // Arg is preloaded into the previous SGPR.
2535       if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2536         Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2537             Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2538         continue;
2539       }
2540 
2541       unsigned Padding = ArgOffset - LastExplicitArgOffset;
2542       unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2543       // Check for free user SGPRs for preloading.
2544       if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2545           SGPRInfo.getNumFreeUserSGPRs()) {
2546         InPreloadSequence = false;
2547         break;
2548       }
2549 
2550       // Preload this argument.
2551       const TargetRegisterClass *RC =
2552           TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2553       SmallVectorImpl<MCRegister> *PreloadRegs =
2554           Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2555 
2556       if (PreloadRegs->size() > 1)
2557         RC = &AMDGPU::SGPR_32RegClass;
2558       for (auto &Reg : *PreloadRegs) {
2559         assert(Reg);
2560         MF.addLiveIn(Reg, RC);
2561         CCInfo.AllocateReg(Reg);
2562       }
2563 
2564       LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2565     }
2566   }
2567 }
2568 
2569 void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
2570                                            const SIRegisterInfo &TRI,
2571                                            SIMachineFunctionInfo &Info) const {
2572   // Always allocate this last since it is a synthetic preload.
2573   if (Info.hasLDSKernelId()) {
2574     Register Reg = Info.addLDSKernelId();
2575     MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2576     CCInfo.AllocateReg(Reg);
2577   }
2578 }
2579 
2580 // Allocate special input registers that are initialized per-wave.
2581 void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
2582                                            MachineFunction &MF,
2583                                            SIMachineFunctionInfo &Info,
2584                                            CallingConv::ID CallConv,
2585                                            bool IsShader) const {
2586   bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2587   if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2588     // Note: user SGPRs are handled by the front-end for graphics shaders
2589     // Pad up the used user SGPRs with dead inputs.
2590 
2591     // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2592     // before enabling architected SGPRs for workgroup IDs.
2593     assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2594 
2595     unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2596     // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2597     // rely on it to reach 16 since if we end up having no stack usage, it will
2598     // not really be added.
2599     unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2600                                       Info.hasWorkGroupIDY() +
2601                                       Info.hasWorkGroupIDZ() +
2602                                       Info.hasWorkGroupInfo();
2603     for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2604       Register Reg = Info.addReservedUserSGPR();
2605       MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2606       CCInfo.AllocateReg(Reg);
2607     }
2608   }
2609 
2610   if (!HasArchitectedSGPRs) {
2611     if (Info.hasWorkGroupIDX()) {
2612       Register Reg = Info.addWorkGroupIDX();
2613       MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2614       CCInfo.AllocateReg(Reg);
2615     }
2616 
2617     if (Info.hasWorkGroupIDY()) {
2618       Register Reg = Info.addWorkGroupIDY();
2619       MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2620       CCInfo.AllocateReg(Reg);
2621     }
2622 
2623     if (Info.hasWorkGroupIDZ()) {
2624       Register Reg = Info.addWorkGroupIDZ();
2625       MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2626       CCInfo.AllocateReg(Reg);
2627     }
2628   }
2629 
2630   if (Info.hasWorkGroupInfo()) {
2631     Register Reg = Info.addWorkGroupInfo();
2632     MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2633     CCInfo.AllocateReg(Reg);
2634   }
2635 
2636   if (Info.hasPrivateSegmentWaveByteOffset()) {
2637     // Scratch wave offset passed in system SGPR.
2638     unsigned PrivateSegmentWaveByteOffsetReg;
2639 
2640     if (IsShader) {
2641       PrivateSegmentWaveByteOffsetReg =
2642         Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2643 
2644       // This is true if the scratch wave byte offset doesn't have a fixed
2645       // location.
2646       if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2647         PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2648         Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2649       }
2650     } else
2651       PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2652 
2653     MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2654     CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2655   }
2656 
2657   assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2658          Info.getNumPreloadedSGPRs() >= 16);
2659 }
2660 
2661 static void reservePrivateMemoryRegs(const TargetMachine &TM,
2662                                      MachineFunction &MF,
2663                                      const SIRegisterInfo &TRI,
2664                                      SIMachineFunctionInfo &Info) {
2665   // Now that we've figured out where the scratch register inputs are, see if
2666   // should reserve the arguments and use them directly.
2667   MachineFrameInfo &MFI = MF.getFrameInfo();
2668   bool HasStackObjects = MFI.hasStackObjects();
2669   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2670 
2671   // Record that we know we have non-spill stack objects so we don't need to
2672   // check all stack objects later.
2673   if (HasStackObjects)
2674     Info.setHasNonSpillStackObjects(true);
2675 
2676   // Everything live out of a block is spilled with fast regalloc, so it's
2677   // almost certain that spilling will be required.
2678   if (TM.getOptLevel() == CodeGenOptLevel::None)
2679     HasStackObjects = true;
2680 
2681   // For now assume stack access is needed in any callee functions, so we need
2682   // the scratch registers to pass in.
2683   bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2684 
2685   if (!ST.enableFlatScratch()) {
2686     if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2687       // If we have stack objects, we unquestionably need the private buffer
2688       // resource. For the Code Object V2 ABI, this will be the first 4 user
2689       // SGPR inputs. We can reserve those and use them directly.
2690 
2691       Register PrivateSegmentBufferReg =
2692           Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
2693       Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2694     } else {
2695       unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2696       // We tentatively reserve the last registers (skipping the last registers
2697       // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2698       // we'll replace these with the ones immediately after those which were
2699       // really allocated. In the prologue copies will be inserted from the
2700       // argument to these reserved registers.
2701 
2702       // Without HSA, relocations are used for the scratch pointer and the
2703       // buffer resource setup is always inserted in the prologue. Scratch wave
2704       // offset is still in an input SGPR.
2705       Info.setScratchRSrcReg(ReservedBufferReg);
2706     }
2707   }
2708 
2709   MachineRegisterInfo &MRI = MF.getRegInfo();
2710 
2711   // For entry functions we have to set up the stack pointer if we use it,
2712   // whereas non-entry functions get this "for free". This means there is no
2713   // intrinsic advantage to using S32 over S34 in cases where we do not have
2714   // calls but do need a frame pointer (i.e. if we are requested to have one
2715   // because frame pointer elimination is disabled). To keep things simple we
2716   // only ever use S32 as the call ABI stack pointer, and so using it does not
2717   // imply we need a separate frame pointer.
2718   //
2719   // Try to use s32 as the SP, but move it if it would interfere with input
2720   // arguments. This won't work with calls though.
2721   //
2722   // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2723   // registers.
2724   if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2725     Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2726   } else {
2727     assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
2728 
2729     if (MFI.hasCalls())
2730       report_fatal_error("call in graphics shader with too many input SGPRs");
2731 
2732     for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2733       if (!MRI.isLiveIn(Reg)) {
2734         Info.setStackPtrOffsetReg(Reg);
2735         break;
2736       }
2737     }
2738 
2739     if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2740       report_fatal_error("failed to find register for SP");
2741   }
2742 
2743   // hasFP should be accurate for entry functions even before the frame is
2744   // finalized, because it does not rely on the known stack size, only
2745   // properties like whether variable sized objects are present.
2746   if (ST.getFrameLowering()->hasFP(MF)) {
2747     Info.setFrameOffsetReg(AMDGPU::SGPR33);
2748   }
2749 }
2750 
2751 bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
2752   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2753   return !Info->isEntryFunction();
2754 }
2755 
2756 void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
2757 
2758 }
2759 
2760 void SITargetLowering::insertCopiesSplitCSR(
2761   MachineBasicBlock *Entry,
2762   const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2763   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2764 
2765   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2766   if (!IStart)
2767     return;
2768 
2769   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2770   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2771   MachineBasicBlock::iterator MBBI = Entry->begin();
2772   for (const MCPhysReg *I = IStart; *I; ++I) {
2773     const TargetRegisterClass *RC = nullptr;
2774     if (AMDGPU::SReg_64RegClass.contains(*I))
2775       RC = &AMDGPU::SGPR_64RegClass;
2776     else if (AMDGPU::SReg_32RegClass.contains(*I))
2777       RC = &AMDGPU::SGPR_32RegClass;
2778     else
2779       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2780 
2781     Register NewVR = MRI->createVirtualRegister(RC);
2782     // Create copy from CSR to a virtual register.
2783     Entry->addLiveIn(*I);
2784     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2785       .addReg(*I);
2786 
2787     // Insert the copy-back instructions right before the terminator.
2788     for (auto *Exit : Exits)
2789       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2790               TII->get(TargetOpcode::COPY), *I)
2791         .addReg(NewVR);
2792   }
2793 }
2794 
2795 SDValue SITargetLowering::LowerFormalArguments(
2796     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2797     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2798     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2799   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2800 
2801   MachineFunction &MF = DAG.getMachineFunction();
2802   const Function &Fn = MF.getFunction();
2803   FunctionType *FType = MF.getFunction().getFunctionType();
2804   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2805 
2806   if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2807     DiagnosticInfoUnsupported NoGraphicsHSA(
2808         Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2809     DAG.getContext()->diagnose(NoGraphicsHSA);
2810     return DAG.getEntryNode();
2811   }
2812 
2813   SmallVector<ISD::InputArg, 16> Splits;
2814   SmallVector<CCValAssign, 16> ArgLocs;
2815   BitVector Skipped(Ins.size());
2816   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2817                  *DAG.getContext());
2818 
2819   bool IsGraphics = AMDGPU::isGraphics(CallConv);
2820   bool IsKernel = AMDGPU::isKernel(CallConv);
2821   bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2822 
2823   if (IsGraphics) {
2824     const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2825     assert(!UserSGPRInfo.hasDispatchPtr() &&
2826            !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2827            !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2828            !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2829     (void)UserSGPRInfo;
2830     if (!Subtarget->enableFlatScratch())
2831       assert(!UserSGPRInfo.hasFlatScratchInit());
2832     if ((CallConv != CallingConv::AMDGPU_CS &&
2833          CallConv != CallingConv::AMDGPU_Gfx) ||
2834         !Subtarget->hasArchitectedSGPRs())
2835       assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2836              !Info->hasWorkGroupIDZ());
2837   }
2838 
2839   if (CallConv == CallingConv::AMDGPU_PS) {
2840     processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2841 
2842     // At least one interpolation mode must be enabled or else the GPU will
2843     // hang.
2844     //
2845     // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2846     // set PSInputAddr, the user wants to enable some bits after the compilation
2847     // based on run-time states. Since we can't know what the final PSInputEna
2848     // will look like, so we shouldn't do anything here and the user should take
2849     // responsibility for the correct programming.
2850     //
2851     // Otherwise, the following restrictions apply:
2852     // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2853     // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2854     //   enabled too.
2855     if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2856         ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2857       CCInfo.AllocateReg(AMDGPU::VGPR0);
2858       CCInfo.AllocateReg(AMDGPU::VGPR1);
2859       Info->markPSInputAllocated(0);
2860       Info->markPSInputEnabled(0);
2861     }
2862     if (Subtarget->isAmdPalOS()) {
2863       // For isAmdPalOS, the user does not enable some bits after compilation
2864       // based on run-time states; the register values being generated here are
2865       // the final ones set in hardware. Therefore we need to apply the
2866       // workaround to PSInputAddr and PSInputEnable together.  (The case where
2867       // a bit is set in PSInputAddr but not PSInputEnable is where the
2868       // frontend set up an input arg for a particular interpolation mode, but
2869       // nothing uses that input arg. Really we should have an earlier pass
2870       // that removes such an arg.)
2871       unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2872       if ((PsInputBits & 0x7F) == 0 ||
2873           ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2874         Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2875     }
2876   } else if (IsKernel) {
2877     assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2878   } else {
2879     Splits.append(Ins.begin(), Ins.end());
2880   }
2881 
2882   if (IsKernel)
2883     analyzeFormalArgumentsCompute(CCInfo, Ins);
2884 
2885   if (IsEntryFunc) {
2886     allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2887     allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2888     if (IsKernel && Subtarget->hasKernargPreload())
2889       allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2890 
2891     allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2892   } else if (!IsGraphics) {
2893     // For the fixed ABI, pass workitem IDs in the last argument register.
2894     allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2895 
2896     // FIXME: Sink this into allocateSpecialInputSGPRs
2897     if (!Subtarget->enableFlatScratch())
2898       CCInfo.AllocateReg(Info->getScratchRSrcReg());
2899 
2900     allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2901   }
2902 
2903   if (!IsKernel) {
2904     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2905     CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2906   }
2907 
2908   SmallVector<SDValue, 16> Chains;
2909 
2910   // FIXME: This is the minimum kernel argument alignment. We should improve
2911   // this to the maximum alignment of the arguments.
2912   //
2913   // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2914   // kern arg offset.
2915   const Align KernelArgBaseAlign = Align(16);
2916 
2917   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2918     const ISD::InputArg &Arg = Ins[i];
2919     if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2920       InVals.push_back(DAG.getUNDEF(Arg.VT));
2921       continue;
2922     }
2923 
2924     CCValAssign &VA = ArgLocs[ArgIdx++];
2925     MVT VT = VA.getLocVT();
2926 
2927     if (IsEntryFunc && VA.isMemLoc()) {
2928       VT = Ins[i].VT;
2929       EVT MemVT = VA.getLocVT();
2930 
2931       const uint64_t Offset = VA.getLocMemOffset();
2932       Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2933 
2934       if (Arg.Flags.isByRef()) {
2935         SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2936 
2937         const GCNTargetMachine &TM =
2938             static_cast<const GCNTargetMachine &>(getTargetMachine());
2939         if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2940                                     Arg.Flags.getPointerAddrSpace())) {
2941           Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
2942                                      Arg.Flags.getPointerAddrSpace());
2943         }
2944 
2945         InVals.push_back(Ptr);
2946         continue;
2947       }
2948 
2949       SDValue NewArg;
2950       if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2951         if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2952           // In this case the argument is packed into the previous preload SGPR.
2953           int64_t AlignDownOffset = alignDown(Offset, 4);
2954           int64_t OffsetDiff = Offset - AlignDownOffset;
2955           EVT IntVT = MemVT.changeTypeToInteger();
2956 
2957           const SIMachineFunctionInfo *Info =
2958               MF.getInfo<SIMachineFunctionInfo>();
2959           MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
2960           Register Reg =
2961               Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2962 
2963           assert(Reg);
2964           Register VReg = MRI.getLiveInVirtReg(Reg);
2965           SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2966 
2967           SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2968           SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2969 
2970           SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2971           ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2972           NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2973                                   Ins[i].Flags.isSExt(), &Ins[i]);
2974 
2975           NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2976         } else {
2977           const SIMachineFunctionInfo *Info =
2978               MF.getInfo<SIMachineFunctionInfo>();
2979           MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
2980           const SmallVectorImpl<MCRegister> &PreloadRegs =
2981               Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2982 
2983           SDValue Copy;
2984           if (PreloadRegs.size() == 1) {
2985             Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2986             const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2987             NewArg = DAG.getCopyFromReg(
2988                 Chain, DL, VReg,
2989                 EVT::getIntegerVT(*DAG.getContext(),
2990                                   TRI->getRegSizeInBits(*RC)));
2991 
2992           } else {
2993             // If the kernarg alignment does not match the alignment of the SGPR
2994             // tuple RC that can accommodate this argument, it will be built up
2995             // via copies from from the individual SGPRs that the argument was
2996             // preloaded to.
2997             SmallVector<SDValue, 4> Elts;
2998             for (auto Reg : PreloadRegs) {
2999               Register VReg = MRI.getLiveInVirtReg(Reg);
3000               Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3001               Elts.push_back(Copy);
3002             }
3003             NewArg =
3004                 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3005                                                     PreloadRegs.size()),
3006                                    DL, Elts);
3007           }
3008 
3009           // If the argument was preloaded to multiple consecutive 32-bit
3010           // registers because of misalignment between addressable SGPR tuples
3011           // and the argument size, we can still assume that because of kernarg
3012           // segment alignment restrictions that NewArg's size is the same as
3013           // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3014           // truncate since we cannot preload to less than a single SGPR and the
3015           // MemVT may be smaller.
3016           EVT MemVTInt =
3017               EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
3018           if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3019             NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3020 
3021           NewArg = DAG.getBitcast(MemVT, NewArg);
3022           NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3023                                   Ins[i].Flags.isSExt(), &Ins[i]);
3024           NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3025         }
3026       } else {
3027         NewArg =
3028             lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3029                                      Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3030       }
3031       Chains.push_back(NewArg.getValue(1));
3032 
3033       auto *ParamTy =
3034         dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3035       if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3036           ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3037                       ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3038         // On SI local pointers are just offsets into LDS, so they are always
3039         // less than 16-bits.  On CI and newer they could potentially be
3040         // real pointers, so we can't guarantee their size.
3041         NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3042                              DAG.getValueType(MVT::i16));
3043       }
3044 
3045       InVals.push_back(NewArg);
3046       continue;
3047     }
3048     if (!IsEntryFunc && VA.isMemLoc()) {
3049       SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3050       InVals.push_back(Val);
3051       if (!Arg.Flags.isByVal())
3052         Chains.push_back(Val.getValue(1));
3053       continue;
3054     }
3055 
3056     assert(VA.isRegLoc() && "Parameter must be in a register!");
3057 
3058     Register Reg = VA.getLocReg();
3059     const TargetRegisterClass *RC = nullptr;
3060     if (AMDGPU::VGPR_32RegClass.contains(Reg))
3061       RC = &AMDGPU::VGPR_32RegClass;
3062     else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3063       RC = &AMDGPU::SGPR_32RegClass;
3064     else
3065       llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3066     EVT ValVT = VA.getValVT();
3067 
3068     Reg = MF.addLiveIn(Reg, RC);
3069     SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3070 
3071     if (Arg.Flags.isSRet()) {
3072       // The return object should be reasonably addressable.
3073 
3074       // FIXME: This helps when the return is a real sret. If it is a
3075       // automatically inserted sret (i.e. CanLowerReturn returns false), an
3076       // extra copy is inserted in SelectionDAGBuilder which obscures this.
3077       unsigned NumBits
3078         = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3079       Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3080         DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3081     }
3082 
3083     // If this is an 8 or 16-bit value, it is really passed promoted
3084     // to 32 bits. Insert an assert[sz]ext to capture this, then
3085     // truncate to the right size.
3086     switch (VA.getLocInfo()) {
3087     case CCValAssign::Full:
3088       break;
3089     case CCValAssign::BCvt:
3090       Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3091       break;
3092     case CCValAssign::SExt:
3093       Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
3094                         DAG.getValueType(ValVT));
3095       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3096       break;
3097     case CCValAssign::ZExt:
3098       Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3099                         DAG.getValueType(ValVT));
3100       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3101       break;
3102     case CCValAssign::AExt:
3103       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3104       break;
3105     default:
3106       llvm_unreachable("Unknown loc info!");
3107     }
3108 
3109     InVals.push_back(Val);
3110   }
3111 
3112   // Start adding system SGPRs.
3113   if (IsEntryFunc)
3114     allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3115 
3116   // DAG.getPass() returns nullptr when using new pass manager.
3117   // TODO: Use DAG.getMFAM() to access analysis result.
3118   if (DAG.getPass()) {
3119     auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3120     ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3121   }
3122 
3123   unsigned StackArgSize = CCInfo.getStackSize();
3124   Info->setBytesInStackArgArea(StackArgSize);
3125 
3126   return Chains.empty() ? Chain :
3127     DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3128 }
3129 
3130 // TODO: If return values can't fit in registers, we should return as many as
3131 // possible in registers before passing on stack.
3132 bool SITargetLowering::CanLowerReturn(
3133   CallingConv::ID CallConv,
3134   MachineFunction &MF, bool IsVarArg,
3135   const SmallVectorImpl<ISD::OutputArg> &Outs,
3136   LLVMContext &Context) const {
3137   // Replacing returns with sret/stack usage doesn't make sense for shaders.
3138   // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3139   // for shaders. Vector types should be explicitly handled by CC.
3140   if (AMDGPU::isEntryFunctionCC(CallConv))
3141     return true;
3142 
3143   SmallVector<CCValAssign, 16> RVLocs;
3144   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3145   if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3146     return false;
3147 
3148   // We must use the stack if return would require unavailable registers.
3149   unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3150   unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3151   for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3152     if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3153       return false;
3154 
3155   return true;
3156 }
3157 
3158 SDValue
3159 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3160                               bool isVarArg,
3161                               const SmallVectorImpl<ISD::OutputArg> &Outs,
3162                               const SmallVectorImpl<SDValue> &OutVals,
3163                               const SDLoc &DL, SelectionDAG &DAG) const {
3164   MachineFunction &MF = DAG.getMachineFunction();
3165   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3166 
3167   if (AMDGPU::isKernel(CallConv)) {
3168     return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3169                                              OutVals, DL, DAG);
3170   }
3171 
3172   bool IsShader = AMDGPU::isShader(CallConv);
3173 
3174   Info->setIfReturnsVoid(Outs.empty());
3175   bool IsWaveEnd = Info->returnsVoid() && IsShader;
3176 
3177   // CCValAssign - represent the assignment of the return value to a location.
3178   SmallVector<CCValAssign, 48> RVLocs;
3179   SmallVector<ISD::OutputArg, 48> Splits;
3180 
3181   // CCState - Info about the registers and stack slots.
3182   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3183                  *DAG.getContext());
3184 
3185   // Analyze outgoing return values.
3186   CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3187 
3188   SDValue Glue;
3189   SmallVector<SDValue, 48> RetOps;
3190   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3191 
3192   // Copy the result values into the output registers.
3193   for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3194        ++I, ++RealRVLocIdx) {
3195     CCValAssign &VA = RVLocs[I];
3196     assert(VA.isRegLoc() && "Can only return in registers!");
3197     // TODO: Partially return in registers if return values don't fit.
3198     SDValue Arg = OutVals[RealRVLocIdx];
3199 
3200     // Copied from other backends.
3201     switch (VA.getLocInfo()) {
3202     case CCValAssign::Full:
3203       break;
3204     case CCValAssign::BCvt:
3205       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3206       break;
3207     case CCValAssign::SExt:
3208       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3209       break;
3210     case CCValAssign::ZExt:
3211       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3212       break;
3213     case CCValAssign::AExt:
3214       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3215       break;
3216     default:
3217       llvm_unreachable("Unknown loc info!");
3218     }
3219 
3220     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3221     Glue = Chain.getValue(1);
3222     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3223   }
3224 
3225   // FIXME: Does sret work properly?
3226   if (!Info->isEntryFunction()) {
3227     const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3228     const MCPhysReg *I =
3229       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3230     if (I) {
3231       for (; *I; ++I) {
3232         if (AMDGPU::SReg_64RegClass.contains(*I))
3233           RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3234         else if (AMDGPU::SReg_32RegClass.contains(*I))
3235           RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3236         else
3237           llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3238       }
3239     }
3240   }
3241 
3242   // Update chain and glue.
3243   RetOps[0] = Chain;
3244   if (Glue.getNode())
3245     RetOps.push_back(Glue);
3246 
3247   unsigned Opc = AMDGPUISD::ENDPGM;
3248   if (!IsWaveEnd)
3249     Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE;
3250   return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3251 }
3252 
3253 SDValue SITargetLowering::LowerCallResult(
3254     SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3255     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3256     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3257     SDValue ThisVal) const {
3258   CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3259 
3260   // Assign locations to each value returned by this call.
3261   SmallVector<CCValAssign, 16> RVLocs;
3262   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3263                  *DAG.getContext());
3264   CCInfo.AnalyzeCallResult(Ins, RetCC);
3265 
3266   // Copy all of the result registers out of their specified physreg.
3267   for (CCValAssign VA : RVLocs) {
3268     SDValue Val;
3269 
3270     if (VA.isRegLoc()) {
3271       Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3272       Chain = Val.getValue(1);
3273       InGlue = Val.getValue(2);
3274     } else if (VA.isMemLoc()) {
3275       report_fatal_error("TODO: return values in memory");
3276     } else
3277       llvm_unreachable("unknown argument location type");
3278 
3279     switch (VA.getLocInfo()) {
3280     case CCValAssign::Full:
3281       break;
3282     case CCValAssign::BCvt:
3283       Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3284       break;
3285     case CCValAssign::ZExt:
3286       Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3287                         DAG.getValueType(VA.getValVT()));
3288       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3289       break;
3290     case CCValAssign::SExt:
3291       Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3292                         DAG.getValueType(VA.getValVT()));
3293       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3294       break;
3295     case CCValAssign::AExt:
3296       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3297       break;
3298     default:
3299       llvm_unreachable("Unknown loc info!");
3300     }
3301 
3302     InVals.push_back(Val);
3303   }
3304 
3305   return Chain;
3306 }
3307 
3308 // Add code to pass special inputs required depending on used features separate
3309 // from the explicit user arguments present in the IR.
3310 void SITargetLowering::passSpecialInputs(
3311     CallLoweringInfo &CLI,
3312     CCState &CCInfo,
3313     const SIMachineFunctionInfo &Info,
3314     SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3315     SmallVectorImpl<SDValue> &MemOpChains,
3316     SDValue Chain) const {
3317   // If we don't have a call site, this was a call inserted by
3318   // legalization. These can never use special inputs.
3319   if (!CLI.CB)
3320     return;
3321 
3322   SelectionDAG &DAG = CLI.DAG;
3323   const SDLoc &DL = CLI.DL;
3324   const Function &F = DAG.getMachineFunction().getFunction();
3325 
3326   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3327   const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3328 
3329   const AMDGPUFunctionArgInfo *CalleeArgInfo
3330     = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
3331   if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3332     // DAG.getPass() returns nullptr when using new pass manager.
3333     // TODO: Use DAG.getMFAM() to access analysis result.
3334     if (DAG.getPass()) {
3335       auto &ArgUsageInfo =
3336           DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3337       CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3338     }
3339   }
3340 
3341   // TODO: Unify with private memory register handling. This is complicated by
3342   // the fact that at least in kernels, the input argument is not necessarily
3343   // in the same location as the input.
3344   static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3345                              StringLiteral> ImplicitAttrs[] = {
3346     {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3347     {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3348     {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3349     {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3350     {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3351     {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3352     {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3353     {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3354   };
3355 
3356   for (auto Attr : ImplicitAttrs) {
3357     const ArgDescriptor *OutgoingArg;
3358     const TargetRegisterClass *ArgRC;
3359     LLT ArgTy;
3360 
3361     AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
3362 
3363     // If the callee does not use the attribute value, skip copying the value.
3364     if (CLI.CB->hasFnAttr(Attr.second))
3365       continue;
3366 
3367     std::tie(OutgoingArg, ArgRC, ArgTy) =
3368         CalleeArgInfo->getPreloadedValue(InputID);
3369     if (!OutgoingArg)
3370       continue;
3371 
3372     const ArgDescriptor *IncomingArg;
3373     const TargetRegisterClass *IncomingArgRC;
3374     LLT Ty;
3375     std::tie(IncomingArg, IncomingArgRC, Ty) =
3376         CallerArgInfo.getPreloadedValue(InputID);
3377     assert(IncomingArgRC == ArgRC);
3378 
3379     // All special arguments are ints for now.
3380     EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3381     SDValue InputReg;
3382 
3383     if (IncomingArg) {
3384       InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3385     } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3386       // The implicit arg ptr is special because it doesn't have a corresponding
3387       // input for kernels, and is computed from the kernarg segment pointer.
3388       InputReg = getImplicitArgPtr(DAG, DL);
3389     } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3390       std::optional<uint32_t> Id =
3391           AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3392       if (Id.has_value()) {
3393         InputReg = DAG.getConstant(*Id, DL, ArgVT);
3394       } else {
3395         InputReg = DAG.getUNDEF(ArgVT);
3396       }
3397     } else {
3398       // We may have proven the input wasn't needed, although the ABI is
3399       // requiring it. We just need to allocate the register appropriately.
3400       InputReg = DAG.getUNDEF(ArgVT);
3401     }
3402 
3403     if (OutgoingArg->isRegister()) {
3404       RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3405       if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3406         report_fatal_error("failed to allocate implicit input argument");
3407     } else {
3408       unsigned SpecialArgOffset =
3409           CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3410       SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3411                                               SpecialArgOffset);
3412       MemOpChains.push_back(ArgStore);
3413     }
3414   }
3415 
3416   // Pack workitem IDs into a single register or pass it as is if already
3417   // packed.
3418   const ArgDescriptor *OutgoingArg;
3419   const TargetRegisterClass *ArgRC;
3420   LLT Ty;
3421 
3422   std::tie(OutgoingArg, ArgRC, Ty) =
3423       CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3424   if (!OutgoingArg)
3425     std::tie(OutgoingArg, ArgRC, Ty) =
3426         CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3427   if (!OutgoingArg)
3428     std::tie(OutgoingArg, ArgRC, Ty) =
3429         CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3430   if (!OutgoingArg)
3431     return;
3432 
3433   const ArgDescriptor *IncomingArgX = std::get<0>(
3434       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3435   const ArgDescriptor *IncomingArgY = std::get<0>(
3436       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3437   const ArgDescriptor *IncomingArgZ = std::get<0>(
3438       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3439 
3440   SDValue InputReg;
3441   SDLoc SL;
3442 
3443   const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3444   const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3445   const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3446 
3447   // If incoming ids are not packed we need to pack them.
3448   if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3449       NeedWorkItemIDX) {
3450     if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3451       InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3452     } else {
3453       InputReg = DAG.getConstant(0, DL, MVT::i32);
3454     }
3455   }
3456 
3457   if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3458       NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3459     SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3460     Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3461                     DAG.getShiftAmountConstant(10, MVT::i32, SL));
3462     InputReg = InputReg.getNode() ?
3463                  DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
3464   }
3465 
3466   if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3467       NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3468     SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3469     Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3470                     DAG.getShiftAmountConstant(20, MVT::i32, SL));
3471     InputReg = InputReg.getNode() ?
3472                  DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
3473   }
3474 
3475   if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3476     if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3477       // We're in a situation where the outgoing function requires the workitem
3478       // ID, but the calling function does not have it (e.g a graphics function
3479       // calling a C calling convention function). This is illegal, but we need
3480       // to produce something.
3481       InputReg = DAG.getUNDEF(MVT::i32);
3482     } else {
3483       // Workitem ids are already packed, any of present incoming arguments
3484       // will carry all required fields.
3485       ArgDescriptor IncomingArg = ArgDescriptor::createArg(
3486         IncomingArgX ? *IncomingArgX :
3487         IncomingArgY ? *IncomingArgY :
3488         *IncomingArgZ, ~0u);
3489       InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3490     }
3491   }
3492 
3493   if (OutgoingArg->isRegister()) {
3494     if (InputReg)
3495       RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3496 
3497     CCInfo.AllocateReg(OutgoingArg->getRegister());
3498   } else {
3499     unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3500     if (InputReg) {
3501       SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3502                                               SpecialArgOffset);
3503       MemOpChains.push_back(ArgStore);
3504     }
3505   }
3506 }
3507 
3508 static bool canGuaranteeTCO(CallingConv::ID CC) {
3509   return CC == CallingConv::Fast;
3510 }
3511 
3512 /// Return true if we might ever do TCO for calls with this calling convention.
3513 static bool mayTailCallThisCC(CallingConv::ID CC) {
3514   switch (CC) {
3515   case CallingConv::C:
3516   case CallingConv::AMDGPU_Gfx:
3517     return true;
3518   default:
3519     return canGuaranteeTCO(CC);
3520   }
3521 }
3522 
3523 bool SITargetLowering::isEligibleForTailCallOptimization(
3524     SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3525     const SmallVectorImpl<ISD::OutputArg> &Outs,
3526     const SmallVectorImpl<SDValue> &OutVals,
3527     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3528   if (AMDGPU::isChainCC(CalleeCC))
3529     return true;
3530 
3531   if (!mayTailCallThisCC(CalleeCC))
3532     return false;
3533 
3534   // For a divergent call target, we need to do a waterfall loop over the
3535   // possible callees which precludes us from using a simple jump.
3536   if (Callee->isDivergent())
3537     return false;
3538 
3539   MachineFunction &MF = DAG.getMachineFunction();
3540   const Function &CallerF = MF.getFunction();
3541   CallingConv::ID CallerCC = CallerF.getCallingConv();
3542   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3543   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3544 
3545   // Kernels aren't callable, and don't have a live in return address so it
3546   // doesn't make sense to do a tail call with entry functions.
3547   if (!CallerPreserved)
3548     return false;
3549 
3550   bool CCMatch = CallerCC == CalleeCC;
3551 
3552   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3553     if (canGuaranteeTCO(CalleeCC) && CCMatch)
3554       return true;
3555     return false;
3556   }
3557 
3558   // TODO: Can we handle var args?
3559   if (IsVarArg)
3560     return false;
3561 
3562   for (const Argument &Arg : CallerF.args()) {
3563     if (Arg.hasByValAttr())
3564       return false;
3565   }
3566 
3567   LLVMContext &Ctx = *DAG.getContext();
3568 
3569   // Check that the call results are passed in the same way.
3570   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3571                                   CCAssignFnForCall(CalleeCC, IsVarArg),
3572                                   CCAssignFnForCall(CallerCC, IsVarArg)))
3573     return false;
3574 
3575   // The callee has to preserve all registers the caller needs to preserve.
3576   if (!CCMatch) {
3577     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3578     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3579       return false;
3580   }
3581 
3582   // Nothing more to check if the callee is taking no arguments.
3583   if (Outs.empty())
3584     return true;
3585 
3586   SmallVector<CCValAssign, 16> ArgLocs;
3587   CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3588 
3589   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3590 
3591   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3592   // If the stack arguments for this call do not fit into our own save area then
3593   // the call cannot be made tail.
3594   // TODO: Is this really necessary?
3595   if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3596     return false;
3597 
3598   const MachineRegisterInfo &MRI = MF.getRegInfo();
3599   return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3600 }
3601 
3602 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3603   if (!CI->isTailCall())
3604     return false;
3605 
3606   const Function *ParentFn = CI->getParent()->getParent();
3607   if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
3608     return false;
3609   return true;
3610 }
3611 
3612 // The wave scratch offset register is used as the global base pointer.
3613 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3614                                     SmallVectorImpl<SDValue> &InVals) const {
3615   CallingConv::ID CallConv = CLI.CallConv;
3616   bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3617 
3618   SelectionDAG &DAG = CLI.DAG;
3619 
3620   TargetLowering::ArgListEntry RequestedExec;
3621   if (IsChainCallConv) {
3622     // The last argument should be the value that we need to put in EXEC.
3623     // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3624     // don't treat it like the rest of the arguments.
3625     RequestedExec = CLI.Args.back();
3626     assert(RequestedExec.Node && "No node for EXEC");
3627 
3628     if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3629       return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3630 
3631     assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3632     CLI.Outs.pop_back();
3633     CLI.OutVals.pop_back();
3634 
3635     if (RequestedExec.Ty->isIntegerTy(64)) {
3636       assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3637       CLI.Outs.pop_back();
3638       CLI.OutVals.pop_back();
3639     }
3640 
3641     assert(CLI.Outs.back().OrigArgIndex != 2 &&
3642            "Haven't popped all the pieces of the EXEC mask");
3643   }
3644 
3645   const SDLoc &DL = CLI.DL;
3646   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3647   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3648   SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3649   SDValue Chain = CLI.Chain;
3650   SDValue Callee = CLI.Callee;
3651   bool &IsTailCall = CLI.IsTailCall;
3652   bool IsVarArg = CLI.IsVarArg;
3653   bool IsSibCall = false;
3654   MachineFunction &MF = DAG.getMachineFunction();
3655 
3656   if (Callee.isUndef() || isNullConstant(Callee)) {
3657     if (!CLI.IsTailCall) {
3658       for (ISD::InputArg &Arg : CLI.Ins)
3659         InVals.push_back(DAG.getUNDEF(Arg.VT));
3660     }
3661 
3662     return Chain;
3663   }
3664 
3665   if (IsVarArg) {
3666     return lowerUnhandledCall(CLI, InVals,
3667                               "unsupported call to variadic function ");
3668   }
3669 
3670   if (!CLI.CB)
3671     report_fatal_error("unsupported libcall legalization");
3672 
3673   if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3674     return lowerUnhandledCall(CLI, InVals,
3675                               "unsupported required tail call to function ");
3676   }
3677 
3678   if (IsTailCall) {
3679     IsTailCall = isEligibleForTailCallOptimization(
3680       Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3681     if (!IsTailCall &&
3682         ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3683       report_fatal_error("failed to perform tail call elimination on a call "
3684                          "site marked musttail or on llvm.amdgcn.cs.chain");
3685     }
3686 
3687     bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3688 
3689     // A sibling call is one where we're under the usual C ABI and not planning
3690     // to change that but can still do a tail call:
3691     if (!TailCallOpt && IsTailCall)
3692       IsSibCall = true;
3693 
3694     if (IsTailCall)
3695       ++NumTailCalls;
3696   }
3697 
3698   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3699   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3700   SmallVector<SDValue, 8> MemOpChains;
3701 
3702   // Analyze operands of the call, assigning locations to each operand.
3703   SmallVector<CCValAssign, 16> ArgLocs;
3704   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3705   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3706 
3707   if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3708     // With a fixed ABI, allocate fixed registers before user arguments.
3709     passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3710   }
3711 
3712   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3713 
3714   // Get a count of how many bytes are to be pushed on the stack.
3715   unsigned NumBytes = CCInfo.getStackSize();
3716 
3717   if (IsSibCall) {
3718     // Since we're not changing the ABI to make this a tail call, the memory
3719     // operands are already available in the caller's incoming argument space.
3720     NumBytes = 0;
3721   }
3722 
3723   // FPDiff is the byte offset of the call's argument area from the callee's.
3724   // Stores to callee stack arguments will be placed in FixedStackSlots offset
3725   // by this amount for a tail call. In a sibling call it must be 0 because the
3726   // caller will deallocate the entire stack and the callee still expects its
3727   // arguments to begin at SP+0. Completely unused for non-tail calls.
3728   int32_t FPDiff = 0;
3729   MachineFrameInfo &MFI = MF.getFrameInfo();
3730 
3731   // Adjust the stack pointer for the new arguments...
3732   // These operations are automatically eliminated by the prolog/epilog pass
3733   if (!IsSibCall)
3734     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3735 
3736   if (!IsSibCall || IsChainCallConv) {
3737     if (!Subtarget->enableFlatScratch()) {
3738       SmallVector<SDValue, 4> CopyFromChains;
3739 
3740       // In the HSA case, this should be an identity copy.
3741       SDValue ScratchRSrcReg
3742         = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3743       RegsToPass.emplace_back(IsChainCallConv
3744                                   ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3745                                   : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3746                               ScratchRSrcReg);
3747       CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3748       Chain = DAG.getTokenFactor(DL, CopyFromChains);
3749     }
3750   }
3751 
3752   MVT PtrVT = MVT::i32;
3753 
3754   // Walk the register/memloc assignments, inserting copies/loads.
3755   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3756     CCValAssign &VA = ArgLocs[i];
3757     SDValue Arg = OutVals[i];
3758 
3759     // Promote the value if needed.
3760     switch (VA.getLocInfo()) {
3761     case CCValAssign::Full:
3762       break;
3763     case CCValAssign::BCvt:
3764       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3765       break;
3766     case CCValAssign::ZExt:
3767       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3768       break;
3769     case CCValAssign::SExt:
3770       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3771       break;
3772     case CCValAssign::AExt:
3773       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3774       break;
3775     case CCValAssign::FPExt:
3776       Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3777       break;
3778     default:
3779       llvm_unreachable("Unknown loc info!");
3780     }
3781 
3782     if (VA.isRegLoc()) {
3783       RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3784     } else {
3785       assert(VA.isMemLoc());
3786 
3787       SDValue DstAddr;
3788       MachinePointerInfo DstInfo;
3789 
3790       unsigned LocMemOffset = VA.getLocMemOffset();
3791       int32_t Offset = LocMemOffset;
3792 
3793       SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3794       MaybeAlign Alignment;
3795 
3796       if (IsTailCall) {
3797         ISD::ArgFlagsTy Flags = Outs[i].Flags;
3798         unsigned OpSize = Flags.isByVal() ?
3799           Flags.getByValSize() : VA.getValVT().getStoreSize();
3800 
3801         // FIXME: We can have better than the minimum byval required alignment.
3802         Alignment =
3803             Flags.isByVal()
3804                 ? Flags.getNonZeroByValAlign()
3805                 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3806 
3807         Offset = Offset + FPDiff;
3808         int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3809 
3810         DstAddr = DAG.getFrameIndex(FI, PtrVT);
3811         DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3812 
3813         // Make sure any stack arguments overlapping with where we're storing
3814         // are loaded before this eventual operation. Otherwise they'll be
3815         // clobbered.
3816 
3817         // FIXME: Why is this really necessary? This seems to just result in a
3818         // lot of code to copy the stack and write them back to the same
3819         // locations, which are supposed to be immutable?
3820         Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3821       } else {
3822         // Stores to the argument stack area are relative to the stack pointer.
3823         SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3824                                         MVT::i32);
3825         DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3826         DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3827         Alignment =
3828             commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3829       }
3830 
3831       if (Outs[i].Flags.isByVal()) {
3832         SDValue SizeNode =
3833             DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3834         SDValue Cpy =
3835             DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3836                           Outs[i].Flags.getNonZeroByValAlign(),
3837                           /*isVol = */ false, /*AlwaysInline = */ true,
3838                           /*CI=*/nullptr, std::nullopt, DstInfo,
3839                           MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
3840 
3841         MemOpChains.push_back(Cpy);
3842       } else {
3843         SDValue Store =
3844             DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3845         MemOpChains.push_back(Store);
3846       }
3847     }
3848   }
3849 
3850   if (!MemOpChains.empty())
3851     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3852 
3853   // Build a sequence of copy-to-reg nodes chained together with token chain
3854   // and flag operands which copy the outgoing args into the appropriate regs.
3855   SDValue InGlue;
3856   for (auto &RegToPass : RegsToPass) {
3857     Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3858                              RegToPass.second, InGlue);
3859     InGlue = Chain.getValue(1);
3860   }
3861 
3862 
3863   // We don't usually want to end the call-sequence here because we would tidy
3864   // the frame up *after* the call, however in the ABI-changing tail-call case
3865   // we've carefully laid out the parameters so that when sp is reset they'll be
3866   // in the correct location.
3867   if (IsTailCall && !IsSibCall) {
3868     Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3869     InGlue = Chain.getValue(1);
3870   }
3871 
3872   std::vector<SDValue> Ops;
3873   Ops.push_back(Chain);
3874   Ops.push_back(Callee);
3875   // Add a redundant copy of the callee global which will not be legalized, as
3876   // we need direct access to the callee later.
3877   if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3878     const GlobalValue *GV = GSD->getGlobal();
3879     Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3880   } else {
3881     Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3882   }
3883 
3884   if (IsTailCall) {
3885     // Each tail call may have to adjust the stack by a different amount, so
3886     // this information must travel along with the operation for eventual
3887     // consumption by emitEpilogue.
3888     Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3889   }
3890 
3891   if (IsChainCallConv)
3892     Ops.push_back(RequestedExec.Node);
3893 
3894   // Add argument registers to the end of the list so that they are known live
3895   // into the call.
3896   for (auto &RegToPass : RegsToPass) {
3897     Ops.push_back(DAG.getRegister(RegToPass.first,
3898                                   RegToPass.second.getValueType()));
3899   }
3900 
3901   // Add a register mask operand representing the call-preserved registers.
3902   auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3903   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3904   assert(Mask && "Missing call preserved mask for calling convention");
3905   Ops.push_back(DAG.getRegisterMask(Mask));
3906 
3907   if (SDValue Token = CLI.ConvergenceControlToken) {
3908     SmallVector<SDValue, 2> GlueOps;
3909     GlueOps.push_back(Token);
3910     if (InGlue)
3911       GlueOps.push_back(InGlue);
3912 
3913     InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3914                                         MVT::Glue, GlueOps),
3915                      0);
3916   }
3917 
3918   if (InGlue)
3919     Ops.push_back(InGlue);
3920 
3921   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3922 
3923   // If we're doing a tall call, use a TC_RETURN here rather than an
3924   // actual call instruction.
3925   if (IsTailCall) {
3926     MFI.setHasTailCall();
3927     unsigned OPC = AMDGPUISD::TC_RETURN;
3928     switch (CallConv) {
3929     case CallingConv::AMDGPU_Gfx:
3930       OPC = AMDGPUISD::TC_RETURN_GFX;
3931       break;
3932     case CallingConv::AMDGPU_CS_Chain:
3933     case CallingConv::AMDGPU_CS_ChainPreserve:
3934       OPC = AMDGPUISD::TC_RETURN_CHAIN;
3935       break;
3936     }
3937 
3938     return DAG.getNode(OPC, DL, NodeTys, Ops);
3939   }
3940 
3941   // Returns a chain and a flag for retval copy to use.
3942   SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3943   Chain = Call.getValue(0);
3944   InGlue = Call.getValue(1);
3945 
3946   uint64_t CalleePopBytes = NumBytes;
3947   Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
3948   if (!Ins.empty())
3949     InGlue = Chain.getValue(1);
3950 
3951   // Handle result values, copying them out of physregs into vregs that we
3952   // return.
3953   return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3954                          InVals, /*IsThisReturn=*/false, SDValue());
3955 }
3956 
3957 // This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3958 // except for applying the wave size scale to the increment amount.
3959 SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
3960     SDValue Op, SelectionDAG &DAG) const {
3961   const MachineFunction &MF = DAG.getMachineFunction();
3962   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3963 
3964   SDLoc dl(Op);
3965   EVT VT = Op.getValueType();
3966   SDValue Tmp1 = Op;
3967   SDValue Tmp2 = Op.getValue(1);
3968   SDValue Tmp3 = Op.getOperand(2);
3969   SDValue Chain = Tmp1.getOperand(0);
3970 
3971   Register SPReg = Info->getStackPtrOffsetReg();
3972 
3973   // Chain the dynamic stack allocation so that it doesn't modify the stack
3974   // pointer when other instructions are using the stack.
3975   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3976 
3977   SDValue Size  = Tmp2.getOperand(1);
3978   SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3979   Chain = SP.getValue(1);
3980   MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3981   const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3982   unsigned Opc =
3983     TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
3984     ISD::ADD : ISD::SUB;
3985 
3986   SDValue ScaledSize = DAG.getNode(
3987       ISD::SHL, dl, VT, Size,
3988       DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
3989 
3990   Align StackAlign = TFL->getStackAlign();
3991   Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3992   if (Alignment && *Alignment > StackAlign) {
3993     Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3994                        DAG.getConstant(-(uint64_t)Alignment->value()
3995                                            << Subtarget->getWavefrontSizeLog2(),
3996                                        dl, VT));
3997   }
3998 
3999   Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1);    // Output chain
4000   Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4001 
4002   return DAG.getMergeValues({Tmp1, Tmp2}, dl);
4003 }
4004 
4005 SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
4006                                                   SelectionDAG &DAG) const {
4007   // We only handle constant sizes here to allow non-entry block, static sized
4008   // allocas. A truly dynamic value is more difficult to support because we
4009   // don't know if the size value is uniform or not. If the size isn't uniform,
4010   // we would need to do a wave reduction to get the maximum size to know how
4011   // much to increment the uniform stack pointer.
4012   SDValue Size = Op.getOperand(1);
4013   if (isa<ConstantSDNode>(Size))
4014       return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
4015 
4016   return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
4017 }
4018 
4019 SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4020   if (Op.getValueType() != MVT::i32)
4021     return Op; // Defer to cannot select error.
4022 
4023   Register SP = getStackPointerRegisterToSaveRestore();
4024   SDLoc SL(Op);
4025 
4026   SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4027 
4028   // Convert from wave uniform to swizzled vector address. This should protect
4029   // from any edge cases where the stacksave result isn't directly used with
4030   // stackrestore.
4031   SDValue VectorAddress =
4032       DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4033   return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4034 }
4035 
4036 SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4037                                             SelectionDAG &DAG) const {
4038   SDLoc SL(Op);
4039   assert(Op.getValueType() == MVT::i32);
4040 
4041   uint32_t BothRoundHwReg =
4042       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4043   SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4044 
4045   SDValue IntrinID =
4046       DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4047   SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4048                                Op.getOperand(0), IntrinID, GetRoundBothImm);
4049 
4050   // There are two rounding modes, one for f32 and one for f64/f16. We only
4051   // report in the standard value range if both are the same.
4052   //
4053   // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4054   // ties away from zero is not supported, and the other values are rotated by
4055   // 1.
4056   //
4057   // If the two rounding modes are not the same, report a target defined value.
4058 
4059   // Mode register rounding mode fields:
4060   //
4061   // [1:0] Single-precision round mode.
4062   // [3:2] Double/Half-precision round mode.
4063   //
4064   // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4065   //
4066   //             Hardware   Spec
4067   // Toward-0        3        0
4068   // Nearest Even    0        1
4069   // +Inf            1        2
4070   // -Inf            2        3
4071   //  NearestAway0  N/A       4
4072   //
4073   // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4074   // table we can index by the raw hardware mode.
4075   //
4076   // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4077 
4078   SDValue BitTable =
4079       DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4080 
4081   SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4082   SDValue RoundModeTimesNumBits =
4083       DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4084 
4085   // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4086   // knew only one mode was demanded.
4087   SDValue TableValue =
4088       DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4089   SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4090 
4091   SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4092   SDValue TableEntry =
4093       DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4094 
4095   // There's a gap in the 4-bit encoded table and actual enum values, so offset
4096   // if it's an extended value.
4097   SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4098   SDValue IsStandardValue =
4099       DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4100   SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4101   SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4102                                TableEntry, EnumOffset);
4103 
4104   return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4105 }
4106 
4107 SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4108                                             SelectionDAG &DAG) const {
4109   SDLoc SL(Op);
4110 
4111   SDValue NewMode = Op.getOperand(1);
4112   assert(NewMode.getValueType() == MVT::i32);
4113 
4114   // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4115   // hardware MODE.fp_round values.
4116   if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4117     uint32_t ClampedVal = std::min(
4118         static_cast<uint32_t>(ConstMode->getZExtValue()),
4119         static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
4120     NewMode = DAG.getConstant(
4121         AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4122   } else {
4123     // If we know the input can only be one of the supported standard modes in
4124     // the range 0-3, we can use a simplified mapping to hardware values.
4125     KnownBits KB = DAG.computeKnownBits(NewMode);
4126     const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4127     // The supported standard values are 0-3. The extended values start at 8. We
4128     // need to offset by 4 if the value is in the extended range.
4129 
4130     if (UseReducedTable) {
4131       // Truncate to the low 32-bits.
4132       SDValue BitTable = DAG.getConstant(
4133           AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4134 
4135       SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4136       SDValue RoundModeTimesNumBits =
4137           DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4138 
4139       NewMode =
4140           DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4141 
4142       // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4143       // the table extracted bits into inline immediates.
4144     } else {
4145       // table_index = umin(value, value - 4)
4146       // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4147       SDValue BitTable =
4148           DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4149 
4150       SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4151       SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4152       SDValue IndexVal =
4153           DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4154 
4155       SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4156       SDValue RoundModeTimesNumBits =
4157           DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4158 
4159       SDValue TableValue =
4160           DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4161       SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4162 
4163       // No need to mask out the high bits since the setreg will ignore them
4164       // anyway.
4165       NewMode = TruncTable;
4166     }
4167 
4168     // Insert a readfirstlane in case the value is a VGPR. We could do this
4169     // earlier and keep more operations scalar, but that interferes with
4170     // combining the source.
4171     SDValue ReadFirstLaneID =
4172         DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4173     NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4174                           ReadFirstLaneID, NewMode);
4175   }
4176 
4177   // N.B. The setreg will be later folded into s_round_mode on supported
4178   // targets.
4179   SDValue IntrinID =
4180       DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4181   uint32_t BothRoundHwReg =
4182       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4183   SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4184 
4185   SDValue SetReg =
4186       DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4187                   IntrinID, RoundBothImm, NewMode);
4188 
4189   return SetReg;
4190 }
4191 
4192 SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4193   if (Op->isDivergent())
4194     return SDValue();
4195 
4196   switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4197   case AMDGPUAS::FLAT_ADDRESS:
4198   case AMDGPUAS::GLOBAL_ADDRESS:
4199   case AMDGPUAS::CONSTANT_ADDRESS:
4200   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4201     break;
4202   default:
4203     return SDValue();
4204   }
4205 
4206   return Op;
4207 }
4208 
4209 // Work around DAG legality rules only based on the result type.
4210 SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4211   bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4212   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4213   EVT SrcVT = Src.getValueType();
4214 
4215   if (SrcVT.getScalarType() != MVT::bf16)
4216     return Op;
4217 
4218   SDLoc SL(Op);
4219   SDValue BitCast =
4220       DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4221 
4222   EVT DstVT = Op.getValueType();
4223   if (IsStrict)
4224     llvm_unreachable("Need STRICT_BF16_TO_FP");
4225 
4226   return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4227 }
4228 
4229 SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4230   SDLoc SL(Op);
4231   if (Op.getValueType() != MVT::i64)
4232     return Op;
4233 
4234   uint32_t ModeHwReg =
4235       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
4236   SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4237   uint32_t TrapHwReg =
4238       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
4239   SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4240 
4241   SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4242   SDValue IntrinID =
4243       DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4244   SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4245                                    Op.getOperand(0), IntrinID, ModeHwRegImm);
4246   SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4247                                    Op.getOperand(0), IntrinID, TrapHwRegImm);
4248   SDValue TokenReg =
4249       DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4250                   GetTrapReg.getValue(1));
4251 
4252   SDValue CvtPtr =
4253       DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4254   SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4255 
4256   return DAG.getMergeValues({Result, TokenReg}, SL);
4257 }
4258 
4259 SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4260   SDLoc SL(Op);
4261   if (Op.getOperand(1).getValueType() != MVT::i64)
4262     return Op;
4263 
4264   SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4265   SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4266                                    DAG.getConstant(0, SL, MVT::i32));
4267   SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4268                                    DAG.getConstant(1, SL, MVT::i32));
4269 
4270   SDValue ReadFirstLaneID =
4271       DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4272   NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4273                            ReadFirstLaneID, NewModeReg);
4274   NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4275                            ReadFirstLaneID, NewTrapReg);
4276 
4277   unsigned ModeHwReg =
4278       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
4279   SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4280   unsigned TrapHwReg =
4281       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
4282   SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4283 
4284   SDValue IntrinID =
4285       DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4286   SDValue SetModeReg =
4287       DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4288                   IntrinID, ModeHwRegImm, NewModeReg);
4289   SDValue SetTrapReg =
4290       DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4291                   IntrinID, TrapHwRegImm, NewTrapReg);
4292   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4293 }
4294 
4295 Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
4296                                              const MachineFunction &MF) const {
4297   Register Reg = StringSwitch<Register>(RegName)
4298     .Case("m0", AMDGPU::M0)
4299     .Case("exec", AMDGPU::EXEC)
4300     .Case("exec_lo", AMDGPU::EXEC_LO)
4301     .Case("exec_hi", AMDGPU::EXEC_HI)
4302     .Case("flat_scratch", AMDGPU::FLAT_SCR)
4303     .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4304     .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4305     .Default(Register());
4306 
4307   if (Reg == AMDGPU::NoRegister) {
4308     report_fatal_error(Twine("invalid register name \""
4309                              + StringRef(RegName)  + "\"."));
4310 
4311   }
4312 
4313   if (!Subtarget->hasFlatScrRegister() &&
4314        Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4315     report_fatal_error(Twine("invalid register \""
4316                              + StringRef(RegName)  + "\" for subtarget."));
4317   }
4318 
4319   switch (Reg) {
4320   case AMDGPU::M0:
4321   case AMDGPU::EXEC_LO:
4322   case AMDGPU::EXEC_HI:
4323   case AMDGPU::FLAT_SCR_LO:
4324   case AMDGPU::FLAT_SCR_HI:
4325     if (VT.getSizeInBits() == 32)
4326       return Reg;
4327     break;
4328   case AMDGPU::EXEC:
4329   case AMDGPU::FLAT_SCR:
4330     if (VT.getSizeInBits() == 64)
4331       return Reg;
4332     break;
4333   default:
4334     llvm_unreachable("missing register type checking");
4335   }
4336 
4337   report_fatal_error(Twine("invalid type for register \""
4338                            + StringRef(RegName) + "\"."));
4339 }
4340 
4341 // If kill is not the last instruction, split the block so kill is always a
4342 // proper terminator.
4343 MachineBasicBlock *
4344 SITargetLowering::splitKillBlock(MachineInstr &MI,
4345                                  MachineBasicBlock *BB) const {
4346   MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4347   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4348   MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4349   return SplitBB;
4350 }
4351 
4352 // Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
4353 // \p MI will be the only instruction in the loop body block. Otherwise, it will
4354 // be the first instruction in the remainder block.
4355 //
4356 /// \returns { LoopBody, Remainder }
4357 static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4358 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
4359   MachineFunction *MF = MBB.getParent();
4360   MachineBasicBlock::iterator I(&MI);
4361 
4362   // To insert the loop we need to split the block. Move everything after this
4363   // point to a new block, and insert a new empty block between the two.
4364   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4365   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4366   MachineFunction::iterator MBBI(MBB);
4367   ++MBBI;
4368 
4369   MF->insert(MBBI, LoopBB);
4370   MF->insert(MBBI, RemainderBB);
4371 
4372   LoopBB->addSuccessor(LoopBB);
4373   LoopBB->addSuccessor(RemainderBB);
4374 
4375   // Move the rest of the block into a new block.
4376   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4377 
4378   if (InstInLoop) {
4379     auto Next = std::next(I);
4380 
4381     // Move instruction to loop body.
4382     LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4383 
4384     // Move the rest of the block.
4385     RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4386   } else {
4387     RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4388   }
4389 
4390   MBB.addSuccessor(LoopBB);
4391 
4392   return std::pair(LoopBB, RemainderBB);
4393 }
4394 
4395 /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4396 void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
4397   MachineBasicBlock *MBB = MI.getParent();
4398   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4399   auto I = MI.getIterator();
4400   auto E = std::next(I);
4401 
4402   BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4403     .addImm(0);
4404 
4405   MIBundleBuilder Bundler(*MBB, I, E);
4406   finalizeBundle(*MBB, Bundler.begin());
4407 }
4408 
4409 MachineBasicBlock *
4410 SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
4411                                          MachineBasicBlock *BB) const {
4412   const DebugLoc &DL = MI.getDebugLoc();
4413 
4414   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4415 
4416   MachineBasicBlock *LoopBB;
4417   MachineBasicBlock *RemainderBB;
4418   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4419 
4420   // Apparently kill flags are only valid if the def is in the same block?
4421   if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4422     Src->setIsKill(false);
4423 
4424   std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
4425 
4426   MachineBasicBlock::iterator I = LoopBB->end();
4427 
4428   const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4429       AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
4430 
4431   // Clear TRAP_STS.MEM_VIOL
4432   BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4433     .addImm(0)
4434     .addImm(EncodedReg);
4435 
4436   bundleInstWithWaitcnt(MI);
4437 
4438   Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4439 
4440   // Load and check TRAP_STS.MEM_VIOL
4441   BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4442     .addImm(EncodedReg);
4443 
4444   // FIXME: Do we need to use an isel pseudo that may clobber scc?
4445   BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4446     .addReg(Reg, RegState::Kill)
4447     .addImm(0);
4448   BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4449     .addMBB(LoopBB);
4450 
4451   return RemainderBB;
4452 }
4453 
4454 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4455 // wavefront. If the value is uniform and just happens to be in a VGPR, this
4456 // will only do one iteration. In the worst case, this will loop 64 times.
4457 //
4458 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
4459 static MachineBasicBlock::iterator
4460 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
4461                        MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4462                        const DebugLoc &DL, const MachineOperand &Idx,
4463                        unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4464                        unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4465                        Register &SGPRIdxReg) {
4466 
4467   MachineFunction *MF = OrigBB.getParent();
4468   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4469   const SIRegisterInfo *TRI = ST.getRegisterInfo();
4470   MachineBasicBlock::iterator I = LoopBB.begin();
4471 
4472   const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4473   Register PhiExec = MRI.createVirtualRegister(BoolRC);
4474   Register NewExec = MRI.createVirtualRegister(BoolRC);
4475   Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4476   Register CondReg = MRI.createVirtualRegister(BoolRC);
4477 
4478   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4479     .addReg(InitReg)
4480     .addMBB(&OrigBB)
4481     .addReg(ResultReg)
4482     .addMBB(&LoopBB);
4483 
4484   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4485     .addReg(InitSaveExecReg)
4486     .addMBB(&OrigBB)
4487     .addReg(NewExec)
4488     .addMBB(&LoopBB);
4489 
4490   // Read the next variant <- also loop target.
4491   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4492       .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4493 
4494   // Compare the just read M0 value to all possible Idx values.
4495   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4496       .addReg(CurrentIdxReg)
4497       .addReg(Idx.getReg(), 0, Idx.getSubReg());
4498 
4499   // Update EXEC, save the original EXEC value to VCC.
4500   BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4501                                                 : AMDGPU::S_AND_SAVEEXEC_B64),
4502           NewExec)
4503     .addReg(CondReg, RegState::Kill);
4504 
4505   MRI.setSimpleHint(NewExec, CondReg);
4506 
4507   if (UseGPRIdxMode) {
4508     if (Offset == 0) {
4509       SGPRIdxReg = CurrentIdxReg;
4510     } else {
4511       SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4512       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4513           .addReg(CurrentIdxReg, RegState::Kill)
4514           .addImm(Offset);
4515     }
4516   } else {
4517     // Move index from VCC into M0
4518     if (Offset == 0) {
4519       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4520         .addReg(CurrentIdxReg, RegState::Kill);
4521     } else {
4522       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4523         .addReg(CurrentIdxReg, RegState::Kill)
4524         .addImm(Offset);
4525     }
4526   }
4527 
4528   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4529   unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4530   MachineInstr *InsertPt =
4531     BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4532                                                   : AMDGPU::S_XOR_B64_term), Exec)
4533       .addReg(Exec)
4534       .addReg(NewExec);
4535 
4536   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4537   // s_cbranch_scc0?
4538 
4539   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4540   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4541     .addMBB(&LoopBB);
4542 
4543   return InsertPt->getIterator();
4544 }
4545 
4546 // This has slightly sub-optimal regalloc when the source vector is killed by
4547 // the read. The register allocator does not understand that the kill is
4548 // per-workitem, so is kept alive for the whole loop so we end up not re-using a
4549 // subregister from it, using 1 more VGPR than necessary. This was saved when
4550 // this was expanded after register allocation.
4551 static MachineBasicBlock::iterator
4552 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
4553                unsigned InitResultReg, unsigned PhiReg, int Offset,
4554                bool UseGPRIdxMode, Register &SGPRIdxReg) {
4555   MachineFunction *MF = MBB.getParent();
4556   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4557   const SIRegisterInfo *TRI = ST.getRegisterInfo();
4558   MachineRegisterInfo &MRI = MF->getRegInfo();
4559   const DebugLoc &DL = MI.getDebugLoc();
4560   MachineBasicBlock::iterator I(&MI);
4561 
4562   const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4563   Register DstReg = MI.getOperand(0).getReg();
4564   Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4565   Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4566   unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4567   unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4568 
4569   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4570 
4571   // Save the EXEC mask
4572   BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4573     .addReg(Exec);
4574 
4575   MachineBasicBlock *LoopBB;
4576   MachineBasicBlock *RemainderBB;
4577   std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
4578 
4579   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4580 
4581   auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4582                                       InitResultReg, DstReg, PhiReg, TmpExec,
4583                                       Offset, UseGPRIdxMode, SGPRIdxReg);
4584 
4585   MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
4586   MachineFunction::iterator MBBI(LoopBB);
4587   ++MBBI;
4588   MF->insert(MBBI, LandingPad);
4589   LoopBB->removeSuccessor(RemainderBB);
4590   LandingPad->addSuccessor(RemainderBB);
4591   LoopBB->addSuccessor(LandingPad);
4592   MachineBasicBlock::iterator First = LandingPad->begin();
4593   BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4594     .addReg(SaveExec);
4595 
4596   return InsPt;
4597 }
4598 
4599 // Returns subreg index, offset
4600 static std::pair<unsigned, int>
4601 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
4602                             const TargetRegisterClass *SuperRC,
4603                             unsigned VecReg,
4604                             int Offset) {
4605   int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4606 
4607   // Skip out of bounds offsets, or else we would end up using an undefined
4608   // register.
4609   if (Offset >= NumElts || Offset < 0)
4610     return std::pair(AMDGPU::sub0, Offset);
4611 
4612   return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4613 }
4614 
4615 static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
4616                                  MachineRegisterInfo &MRI, MachineInstr &MI,
4617                                  int Offset) {
4618   MachineBasicBlock *MBB = MI.getParent();
4619   const DebugLoc &DL = MI.getDebugLoc();
4620   MachineBasicBlock::iterator I(&MI);
4621 
4622   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4623 
4624   assert(Idx->getReg() != AMDGPU::NoRegister);
4625 
4626   if (Offset == 0) {
4627     BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
4628   } else {
4629     BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4630         .add(*Idx)
4631         .addImm(Offset);
4632   }
4633 }
4634 
4635 static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
4636                                    MachineRegisterInfo &MRI, MachineInstr &MI,
4637                                    int Offset) {
4638   MachineBasicBlock *MBB = MI.getParent();
4639   const DebugLoc &DL = MI.getDebugLoc();
4640   MachineBasicBlock::iterator I(&MI);
4641 
4642   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4643 
4644   if (Offset == 0)
4645     return Idx->getReg();
4646 
4647   Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4648   BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4649       .add(*Idx)
4650       .addImm(Offset);
4651   return Tmp;
4652 }
4653 
4654 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
4655                                           MachineBasicBlock &MBB,
4656                                           const GCNSubtarget &ST) {
4657   const SIInstrInfo *TII = ST.getInstrInfo();
4658   const SIRegisterInfo &TRI = TII->getRegisterInfo();
4659   MachineFunction *MF = MBB.getParent();
4660   MachineRegisterInfo &MRI = MF->getRegInfo();
4661 
4662   Register Dst = MI.getOperand(0).getReg();
4663   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4664   Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4665   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4666 
4667   const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4668   const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4669 
4670   unsigned SubReg;
4671   std::tie(SubReg, Offset)
4672     = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4673 
4674   const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4675 
4676   // Check for a SGPR index.
4677   if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4678     MachineBasicBlock::iterator I(&MI);
4679     const DebugLoc &DL = MI.getDebugLoc();
4680 
4681     if (UseGPRIdxMode) {
4682       // TODO: Look at the uses to avoid the copy. This may require rescheduling
4683       // to avoid interfering with other uses, so probably requires a new
4684       // optimization pass.
4685       Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
4686 
4687       const MCInstrDesc &GPRIDXDesc =
4688           TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4689       BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4690           .addReg(SrcReg)
4691           .addReg(Idx)
4692           .addImm(SubReg);
4693     } else {
4694       setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
4695 
4696       BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4697         .addReg(SrcReg, 0, SubReg)
4698         .addReg(SrcReg, RegState::Implicit);
4699     }
4700 
4701     MI.eraseFromParent();
4702 
4703     return &MBB;
4704   }
4705 
4706   // Control flow needs to be inserted if indexing with a VGPR.
4707   const DebugLoc &DL = MI.getDebugLoc();
4708   MachineBasicBlock::iterator I(&MI);
4709 
4710   Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4711   Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4712 
4713   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4714 
4715   Register SGPRIdxReg;
4716   auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4717                               UseGPRIdxMode, SGPRIdxReg);
4718 
4719   MachineBasicBlock *LoopBB = InsPt->getParent();
4720 
4721   if (UseGPRIdxMode) {
4722     const MCInstrDesc &GPRIDXDesc =
4723         TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4724 
4725     BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4726         .addReg(SrcReg)
4727         .addReg(SGPRIdxReg)
4728         .addImm(SubReg);
4729   } else {
4730     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4731       .addReg(SrcReg, 0, SubReg)
4732       .addReg(SrcReg, RegState::Implicit);
4733   }
4734 
4735   MI.eraseFromParent();
4736 
4737   return LoopBB;
4738 }
4739 
4740 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
4741                                           MachineBasicBlock &MBB,
4742                                           const GCNSubtarget &ST) {
4743   const SIInstrInfo *TII = ST.getInstrInfo();
4744   const SIRegisterInfo &TRI = TII->getRegisterInfo();
4745   MachineFunction *MF = MBB.getParent();
4746   MachineRegisterInfo &MRI = MF->getRegInfo();
4747 
4748   Register Dst = MI.getOperand(0).getReg();
4749   const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4750   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4751   const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4752   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4753   const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4754   const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4755 
4756   // This can be an immediate, but will be folded later.
4757   assert(Val->getReg());
4758 
4759   unsigned SubReg;
4760   std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
4761                                                          SrcVec->getReg(),
4762                                                          Offset);
4763   const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4764 
4765   if (Idx->getReg() == AMDGPU::NoRegister) {
4766     MachineBasicBlock::iterator I(&MI);
4767     const DebugLoc &DL = MI.getDebugLoc();
4768 
4769     assert(Offset == 0);
4770 
4771     BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4772         .add(*SrcVec)
4773         .add(*Val)
4774         .addImm(SubReg);
4775 
4776     MI.eraseFromParent();
4777     return &MBB;
4778   }
4779 
4780   // Check for a SGPR index.
4781   if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4782     MachineBasicBlock::iterator I(&MI);
4783     const DebugLoc &DL = MI.getDebugLoc();
4784 
4785     if (UseGPRIdxMode) {
4786       Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
4787 
4788       const MCInstrDesc &GPRIDXDesc =
4789           TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4790       BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4791           .addReg(SrcVec->getReg())
4792           .add(*Val)
4793           .addReg(Idx)
4794           .addImm(SubReg);
4795     } else {
4796       setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
4797 
4798       const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4799           TRI.getRegSizeInBits(*VecRC), 32, false);
4800       BuildMI(MBB, I, DL, MovRelDesc, Dst)
4801           .addReg(SrcVec->getReg())
4802           .add(*Val)
4803           .addImm(SubReg);
4804     }
4805     MI.eraseFromParent();
4806     return &MBB;
4807   }
4808 
4809   // Control flow needs to be inserted if indexing with a VGPR.
4810   if (Val->isReg())
4811     MRI.clearKillFlags(Val->getReg());
4812 
4813   const DebugLoc &DL = MI.getDebugLoc();
4814 
4815   Register PhiReg = MRI.createVirtualRegister(VecRC);
4816 
4817   Register SGPRIdxReg;
4818   auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4819                               UseGPRIdxMode, SGPRIdxReg);
4820   MachineBasicBlock *LoopBB = InsPt->getParent();
4821 
4822   if (UseGPRIdxMode) {
4823     const MCInstrDesc &GPRIDXDesc =
4824         TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4825 
4826     BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4827         .addReg(PhiReg)
4828         .add(*Val)
4829         .addReg(SGPRIdxReg)
4830         .addImm(SubReg);
4831   } else {
4832     const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4833         TRI.getRegSizeInBits(*VecRC), 32, false);
4834     BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4835         .addReg(PhiReg)
4836         .add(*Val)
4837         .addImm(SubReg);
4838   }
4839 
4840   MI.eraseFromParent();
4841   return LoopBB;
4842 }
4843 
4844 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
4845                                           MachineBasicBlock &BB,
4846                                           const GCNSubtarget &ST,
4847                                           unsigned Opc) {
4848   MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
4849   const SIRegisterInfo *TRI = ST.getRegisterInfo();
4850   const DebugLoc &DL = MI.getDebugLoc();
4851   const SIInstrInfo *TII = ST.getInstrInfo();
4852 
4853   // Reduction operations depend on whether the input operand is SGPR or VGPR.
4854   Register SrcReg = MI.getOperand(1).getReg();
4855   bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4856   Register DstReg = MI.getOperand(0).getReg();
4857   MachineBasicBlock *RetBB = nullptr;
4858   if (isSGPR) {
4859     // These operations with a uniform value i.e. SGPR are idempotent.
4860     // Reduced value will be same as given sgpr.
4861     BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4862     RetBB = &BB;
4863   } else {
4864     // TODO: Implement DPP Strategy and switch based on immediate strategy
4865     // operand. For now, for all the cases (default, Iterative and DPP we use
4866     // iterative approach by default.)
4867 
4868     // To reduce the VGPR using iterative approach, we need to iterate
4869     // over all the active lanes. Lowering consists of ComputeLoop,
4870     // which iterate over only active lanes. We use copy of EXEC register
4871     // as induction variable and every active lane modifies it using bitset0
4872     // so that we will get the next active lane for next iteration.
4873     MachineBasicBlock::iterator I = BB.end();
4874     Register SrcReg = MI.getOperand(1).getReg();
4875 
4876     // Create Control flow for loop
4877     // Split MI's Machine Basic block into For loop
4878     auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4879 
4880     // Create virtual registers required for lowering.
4881     const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4882     const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4883     Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4884     Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4885 
4886     Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4887     Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4888     Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4889 
4890     Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4891     Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4892 
4893     bool IsWave32 = ST.isWave32();
4894     unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4895     unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4896 
4897     // Create initail values of induction variable from Exec, Accumulator and
4898     // insert branch instr to newly created ComputeBlockk
4899     uint32_t InitalValue =
4900         (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4901     auto TmpSReg =
4902         BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4903     BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4904         .addImm(InitalValue);
4905     BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
4906 
4907     // Start constructing ComputeLoop
4908     I = ComputeLoop->end();
4909     auto Accumulator =
4910         BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4911             .addReg(InitalValReg)
4912             .addMBB(&BB);
4913     auto ActiveBits =
4914         BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
4915             .addReg(TmpSReg->getOperand(0).getReg())
4916             .addMBB(&BB);
4917 
4918     // Perform the computations
4919     unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4920     auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
4921                    .addReg(ActiveBits->getOperand(0).getReg());
4922     auto LaneValue = BuildMI(*ComputeLoop, I, DL,
4923                              TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4924                          .addReg(SrcReg)
4925                          .addReg(FF1->getOperand(0).getReg());
4926     auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
4927                               .addReg(Accumulator->getOperand(0).getReg())
4928                               .addReg(LaneValue->getOperand(0).getReg());
4929 
4930     // Manipulate the iterator to get the next active lane
4931     unsigned BITSETOpc =
4932         IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
4933     auto NewActiveBits =
4934         BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
4935             .addReg(FF1->getOperand(0).getReg())
4936             .addReg(ActiveBits->getOperand(0).getReg());
4937 
4938     // Add phi nodes
4939     Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4940         .addMBB(ComputeLoop);
4941     ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4942         .addMBB(ComputeLoop);
4943 
4944     // Creating branching
4945     unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
4946     BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
4947         .addReg(NewActiveBits->getOperand(0).getReg())
4948         .addImm(0);
4949     BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4950         .addMBB(ComputeLoop);
4951 
4952     RetBB = ComputeEnd;
4953   }
4954   MI.eraseFromParent();
4955   return RetBB;
4956 }
4957 
4958 MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
4959   MachineInstr &MI, MachineBasicBlock *BB) const {
4960 
4961   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4962   MachineFunction *MF = BB->getParent();
4963   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
4964 
4965   switch (MI.getOpcode()) {
4966   case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
4967     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
4968   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
4969     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
4970   case AMDGPU::S_UADDO_PSEUDO:
4971   case AMDGPU::S_USUBO_PSEUDO: {
4972     const DebugLoc &DL = MI.getDebugLoc();
4973     MachineOperand &Dest0 = MI.getOperand(0);
4974     MachineOperand &Dest1 = MI.getOperand(1);
4975     MachineOperand &Src0 = MI.getOperand(2);
4976     MachineOperand &Src1 = MI.getOperand(3);
4977 
4978     unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4979                        ? AMDGPU::S_ADD_I32
4980                        : AMDGPU::S_SUB_I32;
4981     BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
4982 
4983     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
4984         .addImm(1)
4985         .addImm(0);
4986 
4987     MI.eraseFromParent();
4988     return BB;
4989   }
4990   case AMDGPU::S_ADD_U64_PSEUDO:
4991   case AMDGPU::S_SUB_U64_PSEUDO: {
4992     // For targets older than GFX12, we emit a sequence of 32-bit operations.
4993     // For GFX12, we emit s_add_u64 and s_sub_u64.
4994     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4995     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4996     const DebugLoc &DL = MI.getDebugLoc();
4997     MachineOperand &Dest = MI.getOperand(0);
4998     MachineOperand &Src0 = MI.getOperand(1);
4999     MachineOperand &Src1 = MI.getOperand(2);
5000     bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5001     if (Subtarget->hasScalarAddSub64()) {
5002       unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5003       BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5004         .add(Src0)
5005         .add(Src1);
5006     } else {
5007       const SIRegisterInfo *TRI = ST.getRegisterInfo();
5008       const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5009 
5010       Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5011       Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5012 
5013       MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5014           MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5015       MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5016           MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5017 
5018       MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5019           MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5020       MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5021           MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5022 
5023       unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5024       unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5025       BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5026           .add(Src0Sub0)
5027           .add(Src1Sub0);
5028       BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5029           .add(Src0Sub1)
5030           .add(Src1Sub1);
5031       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5032           .addReg(DestSub0)
5033           .addImm(AMDGPU::sub0)
5034           .addReg(DestSub1)
5035           .addImm(AMDGPU::sub1);
5036     }
5037     MI.eraseFromParent();
5038     return BB;
5039   }
5040   case AMDGPU::V_ADD_U64_PSEUDO:
5041   case AMDGPU::V_SUB_U64_PSEUDO: {
5042     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5043     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5044     const SIRegisterInfo *TRI = ST.getRegisterInfo();
5045     const DebugLoc &DL = MI.getDebugLoc();
5046 
5047     bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5048 
5049     MachineOperand &Dest = MI.getOperand(0);
5050     MachineOperand &Src0 = MI.getOperand(1);
5051     MachineOperand &Src1 = MI.getOperand(2);
5052 
5053     if (IsAdd && ST.hasLshlAddB64()) {
5054       auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5055                          Dest.getReg())
5056                      .add(Src0)
5057                      .addImm(0)
5058                      .add(Src1);
5059       TII->legalizeOperands(*Add);
5060       MI.eraseFromParent();
5061       return BB;
5062     }
5063 
5064     const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5065 
5066     Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5067     Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5068 
5069     Register CarryReg = MRI.createVirtualRegister(CarryRC);
5070     Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5071 
5072     const TargetRegisterClass *Src0RC = Src0.isReg()
5073                                             ? MRI.getRegClass(Src0.getReg())
5074                                             : &AMDGPU::VReg_64RegClass;
5075     const TargetRegisterClass *Src1RC = Src1.isReg()
5076                                             ? MRI.getRegClass(Src1.getReg())
5077                                             : &AMDGPU::VReg_64RegClass;
5078 
5079     const TargetRegisterClass *Src0SubRC =
5080         TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5081     const TargetRegisterClass *Src1SubRC =
5082         TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5083 
5084     MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5085         MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5086     MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5087         MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5088 
5089     MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5090         MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5091     MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5092         MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5093 
5094     unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5095     MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5096                                .addReg(CarryReg, RegState::Define)
5097                                .add(SrcReg0Sub0)
5098                                .add(SrcReg1Sub0)
5099                                .addImm(0); // clamp bit
5100 
5101     unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5102     MachineInstr *HiHalf =
5103         BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5104             .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5105             .add(SrcReg0Sub1)
5106             .add(SrcReg1Sub1)
5107             .addReg(CarryReg, RegState::Kill)
5108             .addImm(0); // clamp bit
5109 
5110     BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5111         .addReg(DestSub0)
5112         .addImm(AMDGPU::sub0)
5113         .addReg(DestSub1)
5114         .addImm(AMDGPU::sub1);
5115     TII->legalizeOperands(*LoHalf);
5116     TII->legalizeOperands(*HiHalf);
5117     MI.eraseFromParent();
5118     return BB;
5119   }
5120   case AMDGPU::S_ADD_CO_PSEUDO:
5121   case AMDGPU::S_SUB_CO_PSEUDO: {
5122     // This pseudo has a chance to be selected
5123     // only from uniform add/subcarry node. All the VGPR operands
5124     // therefore assumed to be splat vectors.
5125     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5126     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5127     const SIRegisterInfo *TRI = ST.getRegisterInfo();
5128     MachineBasicBlock::iterator MII = MI;
5129     const DebugLoc &DL = MI.getDebugLoc();
5130     MachineOperand &Dest = MI.getOperand(0);
5131     MachineOperand &CarryDest = MI.getOperand(1);
5132     MachineOperand &Src0 = MI.getOperand(2);
5133     MachineOperand &Src1 = MI.getOperand(3);
5134     MachineOperand &Src2 = MI.getOperand(4);
5135     unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5136                        ? AMDGPU::S_ADDC_U32
5137                        : AMDGPU::S_SUBB_U32;
5138     if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5139       Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5140       BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5141           .addReg(Src0.getReg());
5142       Src0.setReg(RegOp0);
5143     }
5144     if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5145       Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5146       BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5147           .addReg(Src1.getReg());
5148       Src1.setReg(RegOp1);
5149     }
5150     Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5151     if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5152       BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5153           .addReg(Src2.getReg());
5154       Src2.setReg(RegOp2);
5155     }
5156 
5157     const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5158     unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5159     assert(WaveSize == 64 || WaveSize == 32);
5160 
5161     if (WaveSize == 64) {
5162       if (ST.hasScalarCompareEq64()) {
5163         BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5164             .addReg(Src2.getReg())
5165             .addImm(0);
5166       } else {
5167         const TargetRegisterClass *SubRC =
5168             TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5169         MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5170             MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5171         MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5172             MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5173         Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5174 
5175         BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5176             .add(Src2Sub0)
5177             .add(Src2Sub1);
5178 
5179         BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5180             .addReg(Src2_32, RegState::Kill)
5181             .addImm(0);
5182       }
5183     } else {
5184       BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5185           .addReg(Src2.getReg())
5186           .addImm(0);
5187     }
5188 
5189     BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
5190 
5191     unsigned SelOpc =
5192         (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5193 
5194     BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5195         .addImm(-1)
5196         .addImm(0);
5197 
5198     MI.eraseFromParent();
5199     return BB;
5200   }
5201   case AMDGPU::SI_INIT_M0: {
5202     BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5203             TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5204         .add(MI.getOperand(0));
5205     MI.eraseFromParent();
5206     return BB;
5207   }
5208   case AMDGPU::GET_GROUPSTATICSIZE: {
5209     assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5210            getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5211     DebugLoc DL = MI.getDebugLoc();
5212     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5213         .add(MI.getOperand(0))
5214         .addImm(MFI->getLDSSize());
5215     MI.eraseFromParent();
5216     return BB;
5217   }
5218   case AMDGPU::GET_SHADERCYCLESHILO: {
5219     assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
5220     MachineRegisterInfo &MRI = MF->getRegInfo();
5221     const DebugLoc &DL = MI.getDebugLoc();
5222     // The algorithm is:
5223     //
5224     // hi1 = getreg(SHADER_CYCLES_HI)
5225     // lo1 = getreg(SHADER_CYCLES_LO)
5226     // hi2 = getreg(SHADER_CYCLES_HI)
5227     //
5228     // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5229     // Otherwise there was overflow and the result is hi2:0. In both cases the
5230     // result should represent the actual time at some point during the sequence
5231     // of three getregs.
5232     using namespace AMDGPU::Hwreg;
5233     Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5234     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5235         .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5236     Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5237     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5238         .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5239     Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5240     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5241         .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5242     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5243         .addReg(RegHi1)
5244         .addReg(RegHi2);
5245     Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5246     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5247         .addReg(RegLo1)
5248         .addImm(0);
5249     BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5250         .add(MI.getOperand(0))
5251         .addReg(RegLo)
5252         .addImm(AMDGPU::sub0)
5253         .addReg(RegHi2)
5254         .addImm(AMDGPU::sub1);
5255     MI.eraseFromParent();
5256     return BB;
5257   }
5258   case AMDGPU::SI_INDIRECT_SRC_V1:
5259   case AMDGPU::SI_INDIRECT_SRC_V2:
5260   case AMDGPU::SI_INDIRECT_SRC_V4:
5261   case AMDGPU::SI_INDIRECT_SRC_V8:
5262   case AMDGPU::SI_INDIRECT_SRC_V9:
5263   case AMDGPU::SI_INDIRECT_SRC_V10:
5264   case AMDGPU::SI_INDIRECT_SRC_V11:
5265   case AMDGPU::SI_INDIRECT_SRC_V12:
5266   case AMDGPU::SI_INDIRECT_SRC_V16:
5267   case AMDGPU::SI_INDIRECT_SRC_V32:
5268     return emitIndirectSrc(MI, *BB, *getSubtarget());
5269   case AMDGPU::SI_INDIRECT_DST_V1:
5270   case AMDGPU::SI_INDIRECT_DST_V2:
5271   case AMDGPU::SI_INDIRECT_DST_V4:
5272   case AMDGPU::SI_INDIRECT_DST_V8:
5273   case AMDGPU::SI_INDIRECT_DST_V9:
5274   case AMDGPU::SI_INDIRECT_DST_V10:
5275   case AMDGPU::SI_INDIRECT_DST_V11:
5276   case AMDGPU::SI_INDIRECT_DST_V12:
5277   case AMDGPU::SI_INDIRECT_DST_V16:
5278   case AMDGPU::SI_INDIRECT_DST_V32:
5279     return emitIndirectDst(MI, *BB, *getSubtarget());
5280   case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5281   case AMDGPU::SI_KILL_I1_PSEUDO:
5282     return splitKillBlock(MI, BB);
5283   case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5284     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5285     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5286     const SIRegisterInfo *TRI = ST.getRegisterInfo();
5287 
5288     Register Dst = MI.getOperand(0).getReg();
5289     const MachineOperand &Src0 = MI.getOperand(1);
5290     const MachineOperand &Src1 = MI.getOperand(2);
5291     const DebugLoc &DL = MI.getDebugLoc();
5292     Register SrcCond = MI.getOperand(3).getReg();
5293 
5294     Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5295     Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5296     const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5297     Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5298 
5299     const TargetRegisterClass *Src0RC = Src0.isReg()
5300                                             ? MRI.getRegClass(Src0.getReg())
5301                                             : &AMDGPU::VReg_64RegClass;
5302     const TargetRegisterClass *Src1RC = Src1.isReg()
5303                                             ? MRI.getRegClass(Src1.getReg())
5304                                             : &AMDGPU::VReg_64RegClass;
5305 
5306     const TargetRegisterClass *Src0SubRC =
5307         TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5308     const TargetRegisterClass *Src1SubRC =
5309         TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5310 
5311     MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5312         MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5313     MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5314         MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5315 
5316     MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5317         MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5318     MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5319         MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5320 
5321     BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
5322       .addReg(SrcCond);
5323     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5324         .addImm(0)
5325         .add(Src0Sub0)
5326         .addImm(0)
5327         .add(Src1Sub0)
5328         .addReg(SrcCondCopy);
5329     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5330         .addImm(0)
5331         .add(Src0Sub1)
5332         .addImm(0)
5333         .add(Src1Sub1)
5334         .addReg(SrcCondCopy);
5335 
5336     BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5337       .addReg(DstLo)
5338       .addImm(AMDGPU::sub0)
5339       .addReg(DstHi)
5340       .addImm(AMDGPU::sub1);
5341     MI.eraseFromParent();
5342     return BB;
5343   }
5344   case AMDGPU::SI_BR_UNDEF: {
5345     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5346     const DebugLoc &DL = MI.getDebugLoc();
5347     MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5348                            .add(MI.getOperand(0));
5349     Br->getOperand(1).setIsUndef(); // read undef SCC
5350     MI.eraseFromParent();
5351     return BB;
5352   }
5353   case AMDGPU::ADJCALLSTACKUP:
5354   case AMDGPU::ADJCALLSTACKDOWN: {
5355     const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5356     MachineInstrBuilder MIB(*MF, &MI);
5357     MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5358        .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5359     return BB;
5360   }
5361   case AMDGPU::SI_CALL_ISEL: {
5362     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5363     const DebugLoc &DL = MI.getDebugLoc();
5364 
5365     unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5366 
5367     MachineInstrBuilder MIB;
5368     MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5369 
5370     for (const MachineOperand &MO : MI.operands())
5371       MIB.add(MO);
5372 
5373     MIB.cloneMemRefs(MI);
5374     MI.eraseFromParent();
5375     return BB;
5376   }
5377   case AMDGPU::V_ADD_CO_U32_e32:
5378   case AMDGPU::V_SUB_CO_U32_e32:
5379   case AMDGPU::V_SUBREV_CO_U32_e32: {
5380     // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5381     const DebugLoc &DL = MI.getDebugLoc();
5382     unsigned Opc = MI.getOpcode();
5383 
5384     bool NeedClampOperand = false;
5385     if (TII->pseudoToMCOpcode(Opc) == -1) {
5386       Opc = AMDGPU::getVOPe64(Opc);
5387       NeedClampOperand = true;
5388     }
5389 
5390     auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5391     if (TII->isVOP3(*I)) {
5392       const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5393       const SIRegisterInfo *TRI = ST.getRegisterInfo();
5394       I.addReg(TRI->getVCC(), RegState::Define);
5395     }
5396     I.add(MI.getOperand(1))
5397      .add(MI.getOperand(2));
5398     if (NeedClampOperand)
5399       I.addImm(0); // clamp bit for e64 encoding
5400 
5401     TII->legalizeOperands(*I);
5402 
5403     MI.eraseFromParent();
5404     return BB;
5405   }
5406   case AMDGPU::V_ADDC_U32_e32:
5407   case AMDGPU::V_SUBB_U32_e32:
5408   case AMDGPU::V_SUBBREV_U32_e32:
5409     // These instructions have an implicit use of vcc which counts towards the
5410     // constant bus limit.
5411     TII->legalizeOperands(MI);
5412     return BB;
5413   case AMDGPU::DS_GWS_INIT:
5414   case AMDGPU::DS_GWS_SEMA_BR:
5415   case AMDGPU::DS_GWS_BARRIER:
5416     TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5417     [[fallthrough]];
5418   case AMDGPU::DS_GWS_SEMA_V:
5419   case AMDGPU::DS_GWS_SEMA_P:
5420   case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5421     // A s_waitcnt 0 is required to be the instruction immediately following.
5422     if (getSubtarget()->hasGWSAutoReplay()) {
5423       bundleInstWithWaitcnt(MI);
5424       return BB;
5425     }
5426 
5427     return emitGWSMemViolTestLoop(MI, BB);
5428   case AMDGPU::S_SETREG_B32: {
5429     // Try to optimize cases that only set the denormal mode or rounding mode.
5430     //
5431     // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5432     // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5433     // instead.
5434     //
5435     // FIXME: This could be predicates on the immediate, but tablegen doesn't
5436     // allow you to have a no side effect instruction in the output of a
5437     // sideeffecting pattern.
5438     auto [ID, Offset, Width] =
5439         AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5440     if (ID != AMDGPU::Hwreg::ID_MODE)
5441       return BB;
5442 
5443     const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5444     const unsigned SetMask = WidthMask << Offset;
5445 
5446     if (getSubtarget()->hasDenormModeInst()) {
5447       unsigned SetDenormOp = 0;
5448       unsigned SetRoundOp = 0;
5449 
5450       // The dedicated instructions can only set the whole denorm or round mode
5451       // at once, not a subset of bits in either.
5452       if (SetMask ==
5453           (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5454         // If this fully sets both the round and denorm mode, emit the two
5455         // dedicated instructions for these.
5456         SetRoundOp = AMDGPU::S_ROUND_MODE;
5457         SetDenormOp = AMDGPU::S_DENORM_MODE;
5458       } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5459         SetRoundOp = AMDGPU::S_ROUND_MODE;
5460       } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5461         SetDenormOp = AMDGPU::S_DENORM_MODE;
5462       }
5463 
5464       if (SetRoundOp || SetDenormOp) {
5465         MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5466         MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5467         if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5468           unsigned ImmVal = Def->getOperand(1).getImm();
5469           if (SetRoundOp) {
5470             BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5471                 .addImm(ImmVal & 0xf);
5472 
5473             // If we also have the denorm mode, get just the denorm mode bits.
5474             ImmVal >>= 4;
5475           }
5476 
5477           if (SetDenormOp) {
5478             BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5479                 .addImm(ImmVal & 0xf);
5480           }
5481 
5482           MI.eraseFromParent();
5483           return BB;
5484         }
5485       }
5486     }
5487 
5488     // If only FP bits are touched, used the no side effects pseudo.
5489     if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5490                     AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5491       MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5492 
5493     return BB;
5494   }
5495   case AMDGPU::S_INVERSE_BALLOT_U32:
5496   case AMDGPU::S_INVERSE_BALLOT_U64:
5497     // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5498     // necessary. After that they are equivalent to a COPY.
5499     MI.setDesc(TII->get(AMDGPU::COPY));
5500     return BB;
5501   case AMDGPU::ENDPGM_TRAP: {
5502     const DebugLoc &DL = MI.getDebugLoc();
5503     if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5504       MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5505       MI.addOperand(MachineOperand::CreateImm(0));
5506       return BB;
5507     }
5508 
5509     // We need a block split to make the real endpgm a terminator. We also don't
5510     // want to break phis in successor blocks, so we can't just delete to the
5511     // end of the block.
5512 
5513     MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5514     MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5515     MF->push_back(TrapBB);
5516     BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5517       .addImm(0);
5518     BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5519       .addMBB(TrapBB);
5520 
5521     BB->addSuccessor(TrapBB);
5522     MI.eraseFromParent();
5523     return SplitBB;
5524   }
5525   case AMDGPU::SIMULATED_TRAP: {
5526     assert(Subtarget->hasPrivEnabledTrap2NopBug());
5527     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5528     MachineBasicBlock *SplitBB =
5529         TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5530     MI.eraseFromParent();
5531     return SplitBB;
5532   }
5533   default:
5534     if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5535       if (!MI.mayStore())
5536         AddMemOpInit(MI);
5537       return BB;
5538     }
5539     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
5540   }
5541 }
5542 
5543 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
5544   // This currently forces unfolding various combinations of fsub into fma with
5545   // free fneg'd operands. As long as we have fast FMA (controlled by
5546   // isFMAFasterThanFMulAndFAdd), we should perform these.
5547 
5548   // When fma is quarter rate, for f64 where add / sub are at best half rate,
5549   // most of these combines appear to be cycle neutral but save on instruction
5550   // count / code size.
5551   return true;
5552 }
5553 
5554 bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
5555 
5556 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
5557                                          EVT VT) const {
5558   if (!VT.isVector()) {
5559     return MVT::i1;
5560   }
5561   return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5562 }
5563 
5564 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
5565   // TODO: Should i16 be used always if legal? For now it would force VALU
5566   // shifts.
5567   return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5568 }
5569 
5570 LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
5571   return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5572              ? Ty.changeElementSize(16)
5573              : Ty.changeElementSize(32);
5574 }
5575 
5576 // Answering this is somewhat tricky and depends on the specific device which
5577 // have different rates for fma or all f64 operations.
5578 //
5579 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5580 // regardless of which device (although the number of cycles differs between
5581 // devices), so it is always profitable for f64.
5582 //
5583 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5584 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
5585 // which we can always do even without fused FP ops since it returns the same
5586 // result as the separate operations and since it is always full
5587 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5588 // however does not support denormals, so we do report fma as faster if we have
5589 // a fast fma device and require denormals.
5590 //
5591 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5592                                                   EVT VT) const {
5593   VT = VT.getScalarType();
5594 
5595   switch (VT.getSimpleVT().SimpleTy) {
5596   case MVT::f32: {
5597     // If mad is not available this depends only on if f32 fma is full rate.
5598     if (!Subtarget->hasMadMacF32Insts())
5599       return Subtarget->hasFastFMAF32();
5600 
5601     // Otherwise f32 mad is always full rate and returns the same result as
5602     // the separate operations so should be preferred over fma.
5603     // However does not support denormals.
5604     if (!denormalModeIsFlushAllF32(MF))
5605       return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5606 
5607     // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5608     return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5609   }
5610   case MVT::f64:
5611     return true;
5612   case MVT::f16:
5613     return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5614   default:
5615     break;
5616   }
5617 
5618   return false;
5619 }
5620 
5621 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5622                                                   LLT Ty) const {
5623   switch (Ty.getScalarSizeInBits()) {
5624   case 16:
5625     return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5626   case 32:
5627     return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5628   case 64:
5629     return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5630   default:
5631     break;
5632   }
5633 
5634   return false;
5635 }
5636 
5637 bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
5638   if (!Ty.isScalar())
5639     return false;
5640 
5641   if (Ty.getScalarSizeInBits() == 16)
5642     return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5643   if (Ty.getScalarSizeInBits() == 32)
5644     return Subtarget->hasMadMacF32Insts() &&
5645            denormalModeIsFlushAllF32(*MI.getMF());
5646 
5647   return false;
5648 }
5649 
5650 bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
5651                                    const SDNode *N) const {
5652   // TODO: Check future ftz flag
5653   // v_mad_f32/v_mac_f32 do not support denormals.
5654   EVT VT = N->getValueType(0);
5655   if (VT == MVT::f32)
5656     return Subtarget->hasMadMacF32Insts() &&
5657            denormalModeIsFlushAllF32(DAG.getMachineFunction());
5658   if (VT == MVT::f16) {
5659     return Subtarget->hasMadF16() &&
5660            denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
5661   }
5662 
5663   return false;
5664 }
5665 
5666 //===----------------------------------------------------------------------===//
5667 // Custom DAG Lowering Operations
5668 //===----------------------------------------------------------------------===//
5669 
5670 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5671 // wider vector type is legal.
5672 SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
5673                                              SelectionDAG &DAG) const {
5674   unsigned Opc = Op.getOpcode();
5675   EVT VT = Op.getValueType();
5676   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5677          VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5678          VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5679          VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5680 
5681   SDValue Lo, Hi;
5682   std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
5683 
5684   SDLoc SL(Op);
5685   SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
5686                              Op->getFlags());
5687   SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
5688                              Op->getFlags());
5689 
5690   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5691 }
5692 
5693 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5694 // wider vector type is legal.
5695 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
5696                                               SelectionDAG &DAG) const {
5697   unsigned Opc = Op.getOpcode();
5698   EVT VT = Op.getValueType();
5699   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5700          VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5701          VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5702          VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5703 
5704   SDValue Lo0, Hi0;
5705   std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
5706   SDValue Lo1, Hi1;
5707   std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5708 
5709   SDLoc SL(Op);
5710 
5711   SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
5712                              Op->getFlags());
5713   SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
5714                              Op->getFlags());
5715 
5716   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5717 }
5718 
5719 SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
5720                                               SelectionDAG &DAG) const {
5721   unsigned Opc = Op.getOpcode();
5722   EVT VT = Op.getValueType();
5723   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5724          VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5725          VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5726          VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5727          VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5728          VT == MVT::v32bf16);
5729 
5730   SDValue Lo0, Hi0;
5731   SDValue Op0 = Op.getOperand(0);
5732   std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
5733                            ? DAG.SplitVectorOperand(Op.getNode(), 0)
5734                            : std::pair(Op0, Op0);
5735   SDValue Lo1, Hi1;
5736   std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5737   SDValue Lo2, Hi2;
5738   std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
5739 
5740   SDLoc SL(Op);
5741   auto ResVT = DAG.GetSplitDestVTs(VT);
5742 
5743   SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
5744                              Op->getFlags());
5745   SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
5746                              Op->getFlags());
5747 
5748   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5749 }
5750 
5751 
5752 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
5753   switch (Op.getOpcode()) {
5754   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
5755   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
5756   case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
5757   case ISD::LOAD: {
5758     SDValue Result = LowerLOAD(Op, DAG);
5759     assert((!Result.getNode() ||
5760             Result.getNode()->getNumValues() == 2) &&
5761            "Load should return a value and a chain");
5762     return Result;
5763   }
5764   case ISD::FSQRT: {
5765     EVT VT = Op.getValueType();
5766     if (VT == MVT::f32)
5767       return lowerFSQRTF32(Op, DAG);
5768     if (VT == MVT::f64)
5769       return lowerFSQRTF64(Op, DAG);
5770     return SDValue();
5771   }
5772   case ISD::FSIN:
5773   case ISD::FCOS:
5774     return LowerTrig(Op, DAG);
5775   case ISD::SELECT: return LowerSELECT(Op, DAG);
5776   case ISD::FDIV: return LowerFDIV(Op, DAG);
5777   case ISD::FFREXP: return LowerFFREXP(Op, DAG);
5778   case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
5779   case ISD::STORE: return LowerSTORE(Op, DAG);
5780   case ISD::GlobalAddress: {
5781     MachineFunction &MF = DAG.getMachineFunction();
5782     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
5783     return LowerGlobalAddress(MFI, Op, DAG);
5784   }
5785   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5786   case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
5787   case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
5788   case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
5789   case ISD::INSERT_SUBVECTOR:
5790     return lowerINSERT_SUBVECTOR(Op, DAG);
5791   case ISD::INSERT_VECTOR_ELT:
5792     return lowerINSERT_VECTOR_ELT(Op, DAG);
5793   case ISD::EXTRACT_VECTOR_ELT:
5794     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5795   case ISD::VECTOR_SHUFFLE:
5796     return lowerVECTOR_SHUFFLE(Op, DAG);
5797   case ISD::SCALAR_TO_VECTOR:
5798     return lowerSCALAR_TO_VECTOR(Op, DAG);
5799   case ISD::BUILD_VECTOR:
5800     return lowerBUILD_VECTOR(Op, DAG);
5801   case ISD::FP_ROUND:
5802   case ISD::STRICT_FP_ROUND:
5803     return lowerFP_ROUND(Op, DAG);
5804   case ISD::FPTRUNC_ROUND: {
5805     unsigned Opc;
5806     SDLoc DL(Op);
5807 
5808     if (Op.getOperand(0)->getValueType(0) != MVT::f32)
5809       return SDValue();
5810 
5811     // Get the rounding mode from the last operand
5812     int RoundMode = Op.getConstantOperandVal(1);
5813     if (RoundMode == (int)RoundingMode::TowardPositive)
5814       Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD;
5815     else if (RoundMode == (int)RoundingMode::TowardNegative)
5816       Opc = AMDGPUISD::FPTRUNC_ROUND_DOWNWARD;
5817     else
5818       return SDValue();
5819 
5820     return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
5821   }
5822   case ISD::TRAP:
5823     return lowerTRAP(Op, DAG);
5824   case ISD::DEBUGTRAP:
5825     return lowerDEBUGTRAP(Op, DAG);
5826   case ISD::ABS:
5827   case ISD::FABS:
5828   case ISD::FNEG:
5829   case ISD::FCANONICALIZE:
5830   case ISD::BSWAP:
5831     return splitUnaryVectorOp(Op, DAG);
5832   case ISD::FMINNUM:
5833   case ISD::FMAXNUM:
5834     return lowerFMINNUM_FMAXNUM(Op, DAG);
5835   case ISD::FLDEXP:
5836   case ISD::STRICT_FLDEXP:
5837     return lowerFLDEXP(Op, DAG);
5838   case ISD::FMA:
5839     return splitTernaryVectorOp(Op, DAG);
5840   case ISD::FP_TO_SINT:
5841   case ISD::FP_TO_UINT:
5842     return LowerFP_TO_INT(Op, DAG);
5843   case ISD::SHL:
5844   case ISD::SRA:
5845   case ISD::SRL:
5846   case ISD::ADD:
5847   case ISD::SUB:
5848   case ISD::SMIN:
5849   case ISD::SMAX:
5850   case ISD::UMIN:
5851   case ISD::UMAX:
5852   case ISD::FADD:
5853   case ISD::FMUL:
5854   case ISD::FMINNUM_IEEE:
5855   case ISD::FMAXNUM_IEEE:
5856   case ISD::FMINIMUM:
5857   case ISD::FMAXIMUM:
5858   case ISD::UADDSAT:
5859   case ISD::USUBSAT:
5860   case ISD::SADDSAT:
5861   case ISD::SSUBSAT:
5862     return splitBinaryVectorOp(Op, DAG);
5863   case ISD::MUL:
5864     return lowerMUL(Op, DAG);
5865   case ISD::SMULO:
5866   case ISD::UMULO:
5867     return lowerXMULO(Op, DAG);
5868   case ISD::SMUL_LOHI:
5869   case ISD::UMUL_LOHI:
5870     return lowerXMUL_LOHI(Op, DAG);
5871   case ISD::DYNAMIC_STACKALLOC:
5872     return LowerDYNAMIC_STACKALLOC(Op, DAG);
5873   case ISD::STACKSAVE:
5874     return LowerSTACKSAVE(Op, DAG);
5875   case ISD::GET_ROUNDING:
5876     return lowerGET_ROUNDING(Op, DAG);
5877   case ISD::SET_ROUNDING:
5878     return lowerSET_ROUNDING(Op, DAG);
5879   case ISD::PREFETCH:
5880     return lowerPREFETCH(Op, DAG);
5881   case ISD::FP_EXTEND:
5882   case ISD::STRICT_FP_EXTEND:
5883     return lowerFP_EXTEND(Op, DAG);
5884   case ISD::GET_FPENV:
5885     return lowerGET_FPENV(Op, DAG);
5886   case ISD::SET_FPENV:
5887     return lowerSET_FPENV(Op, DAG);
5888   }
5889   return SDValue();
5890 }
5891 
5892 // Used for D16: Casts the result of an instruction into the right vector,
5893 // packs values if loads return unpacked values.
5894 static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
5895                                        const SDLoc &DL,
5896                                        SelectionDAG &DAG, bool Unpacked) {
5897   if (!LoadVT.isVector())
5898     return Result;
5899 
5900   // Cast back to the original packed type or to a larger type that is a
5901   // multiple of 32 bit for D16. Widening the return type is a required for
5902   // legalization.
5903   EVT FittingLoadVT = LoadVT;
5904   if ((LoadVT.getVectorNumElements() % 2) == 1) {
5905     FittingLoadVT =
5906         EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
5907                          LoadVT.getVectorNumElements() + 1);
5908   }
5909 
5910   if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
5911     // Truncate to v2i16/v4i16.
5912     EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
5913 
5914     // Workaround legalizer not scalarizing truncate after vector op
5915     // legalization but not creating intermediate vector trunc.
5916     SmallVector<SDValue, 4> Elts;
5917     DAG.ExtractVectorElements(Result, Elts);
5918     for (SDValue &Elt : Elts)
5919       Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
5920 
5921     // Pad illegal v1i16/v3fi6 to v4i16
5922     if ((LoadVT.getVectorNumElements() % 2) == 1)
5923       Elts.push_back(DAG.getUNDEF(MVT::i16));
5924 
5925     Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
5926 
5927     // Bitcast to original type (v2f16/v4f16).
5928     return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5929   }
5930 
5931   // Cast back to the original packed type.
5932   return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5933 }
5934 
5935 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
5936                                               MemSDNode *M,
5937                                               SelectionDAG &DAG,
5938                                               ArrayRef<SDValue> Ops,
5939                                               bool IsIntrinsic) const {
5940   SDLoc DL(M);
5941 
5942   bool Unpacked = Subtarget->hasUnpackedD16VMem();
5943   EVT LoadVT = M->getValueType(0);
5944 
5945   EVT EquivLoadVT = LoadVT;
5946   if (LoadVT.isVector()) {
5947     if (Unpacked) {
5948       EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5949                                      LoadVT.getVectorNumElements());
5950     } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
5951       // Widen v3f16 to legal type
5952       EquivLoadVT =
5953           EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
5954                            LoadVT.getVectorNumElements() + 1);
5955     }
5956   }
5957 
5958   // Change from v4f16/v2f16 to EquivLoadVT.
5959   SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
5960 
5961   SDValue Load
5962     = DAG.getMemIntrinsicNode(
5963       IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
5964       VTList, Ops, M->getMemoryVT(),
5965       M->getMemOperand());
5966 
5967   SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
5968 
5969   return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
5970 }
5971 
5972 SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
5973                                              SelectionDAG &DAG,
5974                                              ArrayRef<SDValue> Ops) const {
5975   SDLoc DL(M);
5976   EVT LoadVT = M->getValueType(0);
5977   EVT EltType = LoadVT.getScalarType();
5978   EVT IntVT = LoadVT.changeTypeToInteger();
5979 
5980   bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
5981 
5982   assert(M->getNumValues() == 2 || M->getNumValues() == 3);
5983   bool IsTFE = M->getNumValues() == 3;
5984 
5985   unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
5986                                    : AMDGPUISD::BUFFER_LOAD_FORMAT)
5987                  : IsTFE  ? AMDGPUISD::BUFFER_LOAD_TFE
5988                           : AMDGPUISD::BUFFER_LOAD;
5989 
5990   if (IsD16) {
5991     return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
5992   }
5993 
5994   // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5995   if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
5996     return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
5997                                       IsTFE);
5998 
5999   if (isTypeLegal(LoadVT)) {
6000     return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6001                                M->getMemOperand(), DAG);
6002   }
6003 
6004   EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6005   SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6006   SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6007                                         M->getMemOperand(), DAG);
6008   return DAG.getMergeValues(
6009       {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6010       DL);
6011 }
6012 
6013 static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
6014                                   SDNode *N, SelectionDAG &DAG) {
6015   EVT VT = N->getValueType(0);
6016   unsigned CondCode = N->getConstantOperandVal(3);
6017   if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6018     return DAG.getUNDEF(VT);
6019 
6020   ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6021 
6022   SDValue LHS = N->getOperand(1);
6023   SDValue RHS = N->getOperand(2);
6024 
6025   SDLoc DL(N);
6026 
6027   EVT CmpVT = LHS.getValueType();
6028   if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6029     unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
6030       ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6031     LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6032     RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6033   }
6034 
6035   ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6036 
6037   unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6038   EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6039 
6040   SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6041                               DAG.getCondCode(CCOpcode));
6042   if (VT.bitsEq(CCVT))
6043     return SetCC;
6044   return DAG.getZExtOrTrunc(SetCC, DL, VT);
6045 }
6046 
6047 static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
6048                                   SDNode *N, SelectionDAG &DAG) {
6049   EVT VT = N->getValueType(0);
6050 
6051   unsigned CondCode = N->getConstantOperandVal(3);
6052   if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6053     return DAG.getUNDEF(VT);
6054 
6055   SDValue Src0 = N->getOperand(1);
6056   SDValue Src1 = N->getOperand(2);
6057   EVT CmpVT = Src0.getValueType();
6058   SDLoc SL(N);
6059 
6060   if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6061     Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6062     Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6063   }
6064 
6065   FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6066   ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6067   unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6068   EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6069   SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
6070                               Src1, DAG.getCondCode(CCOpcode));
6071   if (VT.bitsEq(CCVT))
6072     return SetCC;
6073   return DAG.getZExtOrTrunc(SetCC, SL, VT);
6074 }
6075 
6076 static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
6077                                     SelectionDAG &DAG) {
6078   EVT VT = N->getValueType(0);
6079   SDValue Src = N->getOperand(1);
6080   SDLoc SL(N);
6081 
6082   if (Src.getOpcode() == ISD::SETCC) {
6083     // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6084     return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6085                        Src.getOperand(1), Src.getOperand(2));
6086   }
6087   if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6088     // (ballot 0) -> 0
6089     if (Arg->isZero())
6090       return DAG.getConstant(0, SL, VT);
6091 
6092     // (ballot 1) -> EXEC/EXEC_LO
6093     if (Arg->isOne()) {
6094       Register Exec;
6095       if (VT.getScalarSizeInBits() == 32)
6096         Exec = AMDGPU::EXEC_LO;
6097       else if (VT.getScalarSizeInBits() == 64)
6098         Exec = AMDGPU::EXEC;
6099       else
6100         return SDValue();
6101 
6102       return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6103     }
6104   }
6105 
6106   // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6107   // ISD::SETNE)
6108   return DAG.getNode(
6109       AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6110       DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6111 }
6112 
6113 static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
6114                            SelectionDAG &DAG) {
6115   EVT VT = N->getValueType(0);
6116   unsigned ValSize = VT.getSizeInBits();
6117   unsigned IID = N->getConstantOperandVal(0);
6118   bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6119                       IID == Intrinsic::amdgcn_permlanex16;
6120   SDLoc SL(N);
6121   MVT IntVT = MVT::getIntegerVT(ValSize);
6122 
6123   auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6124                                           SDValue Src2, MVT ValT) -> SDValue {
6125     SmallVector<SDValue, 8> Operands;
6126     switch (IID) {
6127     case Intrinsic::amdgcn_permlane16:
6128     case Intrinsic::amdgcn_permlanex16:
6129       Operands.push_back(N->getOperand(6));
6130       Operands.push_back(N->getOperand(5));
6131       Operands.push_back(N->getOperand(4));
6132       [[fallthrough]];
6133     case Intrinsic::amdgcn_writelane:
6134       Operands.push_back(Src2);
6135       [[fallthrough]];
6136     case Intrinsic::amdgcn_readlane:
6137       Operands.push_back(Src1);
6138       [[fallthrough]];
6139     case Intrinsic::amdgcn_readfirstlane:
6140     case Intrinsic::amdgcn_permlane64:
6141       Operands.push_back(Src0);
6142       break;
6143     default:
6144       llvm_unreachable("unhandled lane op");
6145     }
6146 
6147     Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6148     std::reverse(Operands.begin(), Operands.end());
6149 
6150     if (SDNode *GL = N->getGluedNode()) {
6151       assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6152       GL = GL->getOperand(0).getNode();
6153       Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6154                                      SDValue(GL, 0)));
6155     }
6156 
6157     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6158   };
6159 
6160   SDValue Src0 = N->getOperand(1);
6161   SDValue Src1, Src2;
6162   if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6163       IsPermLane16) {
6164     Src1 = N->getOperand(2);
6165     if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
6166       Src2 = N->getOperand(3);
6167   }
6168 
6169   if (ValSize == 32) {
6170     // Already legal
6171     return SDValue();
6172   }
6173 
6174   if (ValSize < 32) {
6175     bool IsFloat = VT.isFloatingPoint();
6176     Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6177                                 SL, MVT::i32);
6178 
6179     if (IsPermLane16) {
6180       Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6181                                   SL, MVT::i32);
6182     }
6183 
6184     if (IID == Intrinsic::amdgcn_writelane) {
6185       Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6186                                   SL, MVT::i32);
6187     }
6188 
6189     SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6190     SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6191     return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6192   }
6193 
6194   if (ValSize % 32 != 0)
6195     return SDValue();
6196 
6197   auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6198     EVT VT = N->getValueType(0);
6199     unsigned NE = VT.getVectorNumElements();
6200     EVT EltVT = VT.getVectorElementType();
6201     SmallVector<SDValue, 8> Scalars;
6202     unsigned NumOperands = N->getNumOperands();
6203     SmallVector<SDValue, 4> Operands(NumOperands);
6204     SDNode *GL = N->getGluedNode();
6205 
6206     // only handle convergencectrl_glue
6207     assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6208 
6209     for (unsigned i = 0; i != NE; ++i) {
6210       for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6211            ++j) {
6212         SDValue Operand = N->getOperand(j);
6213         EVT OperandVT = Operand.getValueType();
6214         if (OperandVT.isVector()) {
6215           // A vector operand; extract a single element.
6216           EVT OperandEltVT = OperandVT.getVectorElementType();
6217           Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6218                                     Operand, DAG.getVectorIdxConstant(i, SL));
6219         } else {
6220           // A scalar operand; just use it as is.
6221           Operands[j] = Operand;
6222         }
6223       }
6224 
6225       if (GL)
6226         Operands[NumOperands - 1] =
6227             DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6228                         SDValue(GL->getOperand(0).getNode(), 0));
6229 
6230       Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6231     }
6232 
6233     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6234     return DAG.getBuildVector(VecVT, SL, Scalars);
6235   };
6236 
6237   if (VT.isVector()) {
6238     switch (MVT::SimpleValueType EltTy =
6239                 VT.getVectorElementType().getSimpleVT().SimpleTy) {
6240     case MVT::i32:
6241     case MVT::f32: {
6242       SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6243       return unrollLaneOp(LaneOp.getNode());
6244     }
6245     case MVT::i16:
6246     case MVT::f16:
6247     case MVT::bf16: {
6248       MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
6249       SmallVector<SDValue, 4> Pieces;
6250       SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6251       for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
6252         Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6253                                  DAG.getConstant(EltIdx, SL, MVT::i32));
6254 
6255         if (IsPermLane16)
6256           Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6257                                    DAG.getConstant(EltIdx, SL, MVT::i32));
6258 
6259         if (IID == Intrinsic::amdgcn_writelane)
6260           Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6261                                    DAG.getConstant(EltIdx, SL, MVT::i32));
6262 
6263         Pieces.push_back(
6264             IsPermLane16
6265                 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6266                 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6267         EltIdx += 2;
6268       }
6269       return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6270     }
6271     default:
6272       // Handle all other cases by bitcasting to i32 vectors
6273       break;
6274     }
6275   }
6276 
6277   MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
6278   Src0 = DAG.getBitcast(VecVT, Src0);
6279 
6280   if (IsPermLane16)
6281     Src1 = DAG.getBitcast(VecVT, Src1);
6282 
6283   if (IID == Intrinsic::amdgcn_writelane)
6284     Src2 = DAG.getBitcast(VecVT, Src2);
6285 
6286   SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6287   SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6288   return DAG.getBitcast(VT, UnrolledLaneOp);
6289 }
6290 
6291 void SITargetLowering::ReplaceNodeResults(SDNode *N,
6292                                           SmallVectorImpl<SDValue> &Results,
6293                                           SelectionDAG &DAG) const {
6294   switch (N->getOpcode()) {
6295   case ISD::INSERT_VECTOR_ELT: {
6296     if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6297       Results.push_back(Res);
6298     return;
6299   }
6300   case ISD::EXTRACT_VECTOR_ELT: {
6301     if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6302       Results.push_back(Res);
6303     return;
6304   }
6305   case ISD::INTRINSIC_WO_CHAIN: {
6306     unsigned IID = N->getConstantOperandVal(0);
6307     switch (IID) {
6308     case Intrinsic::amdgcn_make_buffer_rsrc:
6309       Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6310       return;
6311     case Intrinsic::amdgcn_cvt_pkrtz: {
6312       SDValue Src0 = N->getOperand(1);
6313       SDValue Src1 = N->getOperand(2);
6314       SDLoc SL(N);
6315       SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
6316                                 Src0, Src1);
6317       Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6318       return;
6319     }
6320     case Intrinsic::amdgcn_cvt_pknorm_i16:
6321     case Intrinsic::amdgcn_cvt_pknorm_u16:
6322     case Intrinsic::amdgcn_cvt_pk_i16:
6323     case Intrinsic::amdgcn_cvt_pk_u16: {
6324       SDValue Src0 = N->getOperand(1);
6325       SDValue Src1 = N->getOperand(2);
6326       SDLoc SL(N);
6327       unsigned Opcode;
6328 
6329       if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6330         Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
6331       else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6332         Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
6333       else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6334         Opcode = AMDGPUISD::CVT_PK_I16_I32;
6335       else
6336         Opcode = AMDGPUISD::CVT_PK_U16_U32;
6337 
6338       EVT VT = N->getValueType(0);
6339       if (isTypeLegal(VT))
6340         Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6341       else {
6342         SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6343         Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6344       }
6345       return;
6346     }
6347     case Intrinsic::amdgcn_s_buffer_load: {
6348       // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6349       // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
6350       // combiner tries to merge the s_buffer_load_u8 with a sext instruction
6351       // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
6352       // s_buffer_load_i8.
6353       if (!Subtarget->hasScalarSubwordLoads())
6354         return;
6355       SDValue Op = SDValue(N, 0);
6356       SDValue Rsrc = Op.getOperand(1);
6357       SDValue Offset = Op.getOperand(2);
6358       SDValue CachePolicy = Op.getOperand(3);
6359       EVT VT = Op.getValueType();
6360       assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6361       SDLoc DL(Op);
6362       MachineFunction &MF = DAG.getMachineFunction();
6363       const DataLayout &DataLayout = DAG.getDataLayout();
6364       Align Alignment =
6365           DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
6366       MachineMemOperand *MMO = MF.getMachineMemOperand(
6367           MachinePointerInfo(),
6368           MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6369               MachineMemOperand::MOInvariant,
6370           VT.getStoreSize(), Alignment);
6371       SDValue LoadVal;
6372       if (!Offset->isDivergent()) {
6373         SDValue Ops[] = {Rsrc, // source register
6374                          Offset, CachePolicy};
6375         SDValue BufferLoad =
6376             DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
6377                                     DAG.getVTList(MVT::i32), Ops, VT, MMO);
6378         LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6379       } else {
6380         SDValue Ops[] = {
6381             DAG.getEntryNode(),                    // Chain
6382             Rsrc,                                  // rsrc
6383             DAG.getConstant(0, DL, MVT::i32),      // vindex
6384             {},                                    // voffset
6385             {},                                    // soffset
6386             {},                                    // offset
6387             CachePolicy,                           // cachepolicy
6388             DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6389         };
6390         setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6391         LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6392       }
6393       Results.push_back(LoadVal);
6394       return;
6395     }
6396     }
6397     break;
6398   }
6399   case ISD::INTRINSIC_W_CHAIN: {
6400     if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6401       if (Res.getOpcode() == ISD::MERGE_VALUES) {
6402         // FIXME: Hacky
6403         for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6404           Results.push_back(Res.getOperand(I));
6405         }
6406       } else {
6407         Results.push_back(Res);
6408         Results.push_back(Res.getValue(1));
6409       }
6410       return;
6411     }
6412 
6413     break;
6414   }
6415   case ISD::SELECT: {
6416     SDLoc SL(N);
6417     EVT VT = N->getValueType(0);
6418     EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6419     SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6420     SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6421 
6422     EVT SelectVT = NewVT;
6423     if (NewVT.bitsLT(MVT::i32)) {
6424       LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6425       RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6426       SelectVT = MVT::i32;
6427     }
6428 
6429     SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
6430                                     N->getOperand(0), LHS, RHS);
6431 
6432     if (NewVT != SelectVT)
6433       NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6434     Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6435     return;
6436   }
6437   case ISD::FNEG: {
6438     if (N->getValueType(0) != MVT::v2f16)
6439       break;
6440 
6441     SDLoc SL(N);
6442     SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6443 
6444     SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
6445                              BC,
6446                              DAG.getConstant(0x80008000, SL, MVT::i32));
6447     Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6448     return;
6449   }
6450   case ISD::FABS: {
6451     if (N->getValueType(0) != MVT::v2f16)
6452       break;
6453 
6454     SDLoc SL(N);
6455     SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6456 
6457     SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
6458                              BC,
6459                              DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6460     Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6461     return;
6462   }
6463   case ISD::FSQRT: {
6464     if (N->getValueType(0) != MVT::f16)
6465       break;
6466     Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6467     break;
6468   }
6469   default:
6470     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
6471     break;
6472   }
6473 }
6474 
6475 /// Helper function for LowerBRCOND
6476 static SDNode *findUser(SDValue Value, unsigned Opcode) {
6477 
6478   SDNode *Parent = Value.getNode();
6479   for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
6480        I != E; ++I) {
6481 
6482     if (I.getUse().get() != Value)
6483       continue;
6484 
6485     if (I->getOpcode() == Opcode)
6486       return *I;
6487   }
6488   return nullptr;
6489 }
6490 
6491 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6492   if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6493     switch (Intr->getConstantOperandVal(1)) {
6494     case Intrinsic::amdgcn_if:
6495       return AMDGPUISD::IF;
6496     case Intrinsic::amdgcn_else:
6497       return AMDGPUISD::ELSE;
6498     case Intrinsic::amdgcn_loop:
6499       return AMDGPUISD::LOOP;
6500     case Intrinsic::amdgcn_end_cf:
6501       llvm_unreachable("should not occur");
6502     default:
6503       return 0;
6504     }
6505   }
6506 
6507   // break, if_break, else_break are all only used as inputs to loop, not
6508   // directly as branch conditions.
6509   return 0;
6510 }
6511 
6512 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
6513   const Triple &TT = getTargetMachine().getTargetTriple();
6514   return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
6515           GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
6516          AMDGPU::shouldEmitConstantsToTextSection(TT);
6517 }
6518 
6519 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
6520   if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6521     return false;
6522 
6523   // FIXME: Either avoid relying on address space here or change the default
6524   // address space for functions to avoid the explicit check.
6525   return (GV->getValueType()->isFunctionTy() ||
6526           !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
6527          !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
6528 }
6529 
6530 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
6531   return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6532 }
6533 
6534 bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
6535   if (!GV->hasExternalLinkage())
6536     return true;
6537 
6538   const auto OS = getTargetMachine().getTargetTriple().getOS();
6539   return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6540 }
6541 
6542 /// This transforms the control flow intrinsics to get the branch destination as
6543 /// last parameter, also switches branch target with BR if the need arise
6544 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
6545                                       SelectionDAG &DAG) const {
6546   SDLoc DL(BRCOND);
6547 
6548   SDNode *Intr = BRCOND.getOperand(1).getNode();
6549   SDValue Target = BRCOND.getOperand(2);
6550   SDNode *BR = nullptr;
6551   SDNode *SetCC = nullptr;
6552 
6553   if (Intr->getOpcode() == ISD::SETCC) {
6554     // As long as we negate the condition everything is fine
6555     SetCC = Intr;
6556     Intr = SetCC->getOperand(0).getNode();
6557 
6558   } else {
6559     // Get the target from BR if we don't negate the condition
6560     BR = findUser(BRCOND, ISD::BR);
6561     assert(BR && "brcond missing unconditional branch user");
6562     Target = BR->getOperand(1);
6563   }
6564 
6565   unsigned CFNode = isCFIntrinsic(Intr);
6566   if (CFNode == 0) {
6567     // This is a uniform branch so we don't need to legalize.
6568     return BRCOND;
6569   }
6570 
6571   bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6572                    Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6573 
6574   assert(!SetCC ||
6575         (SetCC->getConstantOperandVal(1) == 1 &&
6576          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6577                                                              ISD::SETNE));
6578 
6579   // operands of the new intrinsic call
6580   SmallVector<SDValue, 4> Ops;
6581   if (HaveChain)
6582     Ops.push_back(BRCOND.getOperand(0));
6583 
6584   Ops.append(Intr->op_begin() + (HaveChain ?  2 : 1), Intr->op_end());
6585   Ops.push_back(Target);
6586 
6587   ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6588 
6589   // build the new intrinsic call
6590   SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6591 
6592   if (!HaveChain) {
6593     SDValue Ops[] =  {
6594       SDValue(Result, 0),
6595       BRCOND.getOperand(0)
6596     };
6597 
6598     Result = DAG.getMergeValues(Ops, DL).getNode();
6599   }
6600 
6601   if (BR) {
6602     // Give the branch instruction our target
6603     SDValue Ops[] = {
6604       BR->getOperand(0),
6605       BRCOND.getOperand(2)
6606     };
6607     SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6608     DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6609   }
6610 
6611   SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6612 
6613   // Copy the intrinsic results to registers
6614   for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6615     SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
6616     if (!CopyToReg)
6617       continue;
6618 
6619     Chain = DAG.getCopyToReg(
6620       Chain, DL,
6621       CopyToReg->getOperand(1),
6622       SDValue(Result, i - 1),
6623       SDValue());
6624 
6625     DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6626   }
6627 
6628   // Remove the old intrinsic from the chain
6629   DAG.ReplaceAllUsesOfValueWith(
6630     SDValue(Intr, Intr->getNumValues() - 1),
6631     Intr->getOperand(0));
6632 
6633   return Chain;
6634 }
6635 
6636 SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
6637                                           SelectionDAG &DAG) const {
6638   MVT VT = Op.getSimpleValueType();
6639   SDLoc DL(Op);
6640   // Checking the depth
6641   if (Op.getConstantOperandVal(0) != 0)
6642     return DAG.getConstant(0, DL, VT);
6643 
6644   MachineFunction &MF = DAG.getMachineFunction();
6645   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6646   // Check for kernel and shader functions
6647   if (Info->isEntryFunction())
6648     return DAG.getConstant(0, DL, VT);
6649 
6650   MachineFrameInfo &MFI = MF.getFrameInfo();
6651   // There is a call to @llvm.returnaddress in this function
6652   MFI.setReturnAddressIsTaken(true);
6653 
6654   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
6655   // Get the return address reg and mark it as an implicit live-in
6656   Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
6657 
6658   return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6659 }
6660 
6661 SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
6662                                             SDValue Op,
6663                                             const SDLoc &DL,
6664                                             EVT VT) const {
6665   return Op.getValueType().bitsLE(VT) ?
6666       DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
6667     DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6668                 DAG.getTargetConstant(0, DL, MVT::i32));
6669 }
6670 
6671 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6672   assert(Op.getValueType() == MVT::f16 &&
6673          "Do not know how to custom lower FP_ROUND for non-f16 type");
6674 
6675   SDValue Src = Op.getOperand(0);
6676   EVT SrcVT = Src.getValueType();
6677   if (SrcVT != MVT::f64)
6678     return Op;
6679 
6680   // TODO: Handle strictfp
6681   if (Op.getOpcode() != ISD::FP_ROUND)
6682     return Op;
6683 
6684   SDLoc DL(Op);
6685 
6686   SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6687   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6688   return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6689 }
6690 
6691 SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6692                                                SelectionDAG &DAG) const {
6693   EVT VT = Op.getValueType();
6694   const MachineFunction &MF = DAG.getMachineFunction();
6695   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6696   bool IsIEEEMode = Info->getMode().IEEE;
6697 
6698   // FIXME: Assert during selection that this is only selected for
6699   // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6700   // mode functions, but this happens to be OK since it's only done in cases
6701   // where there is known no sNaN.
6702   if (IsIEEEMode)
6703     return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6704 
6705   if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6706       VT == MVT::v16bf16)
6707     return splitBinaryVectorOp(Op, DAG);
6708   return Op;
6709 }
6710 
6711 SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6712   bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6713   EVT VT = Op.getValueType();
6714   assert(VT == MVT::f16);
6715 
6716   SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6717   EVT ExpVT = Exp.getValueType();
6718   if (ExpVT == MVT::i16)
6719     return Op;
6720 
6721   SDLoc DL(Op);
6722 
6723   // Correct the exponent type for f16 to i16.
6724   // Clamp the range of the exponent to the instruction's range.
6725 
6726   // TODO: This should be a generic narrowing legalization, and can easily be
6727   // for GlobalISel.
6728 
6729   SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT);
6730   SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6731 
6732   SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT);
6733   SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6734 
6735   SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6736 
6737   if (IsStrict) {
6738     return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6739                        {Op.getOperand(0), Op.getOperand(1), TruncExp});
6740   }
6741 
6742   return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6743 }
6744 
6745 // Custom lowering for vector multiplications and s_mul_u64.
6746 SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6747   EVT VT = Op.getValueType();
6748 
6749   // Split vector operands.
6750   if (VT.isVector())
6751     return splitBinaryVectorOp(Op, DAG);
6752 
6753   assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6754 
6755   // There are four ways to lower s_mul_u64:
6756   //
6757   // 1. If all the operands are uniform, then we lower it as it is.
6758   //
6759   // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
6760   //    multiplications because there is not a vector equivalent of s_mul_u64.
6761   //
6762   // 3. If the cost model decides that it is more efficient to use vector
6763   //    registers, then we have to split s_mul_u64 in 32-bit multiplications.
6764   //    This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
6765   //
6766   // 4. If the cost model decides to use vector registers and both of the
6767   //    operands are zero-extended/sign-extended from 32-bits, then we split the
6768   //    s_mul_u64 in two 32-bit multiplications. The problem is that it is not
6769   //    possible to check if the operands are zero-extended or sign-extended in
6770   //    SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6771   //    s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6772   //    s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6773   //    If the cost model decides that we have to use vector registers, then
6774   //    splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
6775   //    s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
6776   //    decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6777   //    s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6778   //    SIInstrInfo.cpp .
6779 
6780   if (Op->isDivergent())
6781     return SDValue();
6782 
6783   SDValue Op0 = Op.getOperand(0);
6784   SDValue Op1 = Op.getOperand(1);
6785   // If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64
6786   // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6787   // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6788   KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6789   unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6790   KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
6791   unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6792   SDLoc SL(Op);
6793   if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6794     return SDValue(
6795         DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6796   unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
6797   unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
6798   if (Op0SignBits >= 33 && Op1SignBits >= 33)
6799     return SDValue(
6800         DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6801   // If all the operands are uniform, then we lower s_mul_u64 as it is.
6802   return Op;
6803 }
6804 
6805 SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
6806   EVT VT = Op.getValueType();
6807   SDLoc SL(Op);
6808   SDValue LHS = Op.getOperand(0);
6809   SDValue RHS = Op.getOperand(1);
6810   bool isSigned = Op.getOpcode() == ISD::SMULO;
6811 
6812   if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
6813     const APInt &C = RHSC->getAPIntValue();
6814     // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
6815     if (C.isPowerOf2()) {
6816       // smulo(x, signed_min) is same as umulo(x, signed_min).
6817       bool UseArithShift = isSigned && !C.isMinSignedValue();
6818       SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
6819       SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
6820       SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
6821           DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
6822                       SL, VT, Result, ShiftAmt),
6823           LHS, ISD::SETNE);
6824       return DAG.getMergeValues({ Result, Overflow }, SL);
6825     }
6826   }
6827 
6828   SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
6829   SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU,
6830                             SL, VT, LHS, RHS);
6831 
6832   SDValue Sign = isSigned
6833     ? DAG.getNode(ISD::SRA, SL, VT, Result,
6834                   DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
6835     : DAG.getConstant(0, SL, VT);
6836   SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
6837 
6838   return DAG.getMergeValues({ Result, Overflow }, SL);
6839 }
6840 
6841 SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
6842   if (Op->isDivergent()) {
6843     // Select to V_MAD_[IU]64_[IU]32.
6844     return Op;
6845   }
6846   if (Subtarget->hasSMulHi()) {
6847     // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
6848     return SDValue();
6849   }
6850   // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
6851   // calculate the high part, so we might as well do the whole thing with
6852   // V_MAD_[IU]64_[IU]32.
6853   return Op;
6854 }
6855 
6856 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
6857   if (!Subtarget->isTrapHandlerEnabled() ||
6858       Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6859     return lowerTrapEndpgm(Op, DAG);
6860 
6861   return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
6862          lowerTrapHsaQueuePtr(Op, DAG);
6863 }
6864 
6865 SDValue SITargetLowering::lowerTrapEndpgm(
6866     SDValue Op, SelectionDAG &DAG) const {
6867   SDLoc SL(Op);
6868   SDValue Chain = Op.getOperand(0);
6869   return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
6870 }
6871 
6872 SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
6873     const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
6874   MachineFunction &MF = DAG.getMachineFunction();
6875   uint64_t Offset = getImplicitParameterOffset(MF, Param);
6876   SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
6877   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6878   return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
6879                      MachineMemOperand::MODereferenceable |
6880                          MachineMemOperand::MOInvariant);
6881 }
6882 
6883 SDValue SITargetLowering::lowerTrapHsaQueuePtr(
6884     SDValue Op, SelectionDAG &DAG) const {
6885   SDLoc SL(Op);
6886   SDValue Chain = Op.getOperand(0);
6887 
6888   SDValue QueuePtr;
6889   // For code object version 5, QueuePtr is passed through implicit kernarg.
6890   const Module *M = DAG.getMachineFunction().getFunction().getParent();
6891   if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
6892     QueuePtr =
6893         loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
6894   } else {
6895     MachineFunction &MF = DAG.getMachineFunction();
6896     SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6897     Register UserSGPR = Info->getQueuePtrUserSGPR();
6898 
6899     if (UserSGPR == AMDGPU::NoRegister) {
6900       // We probably are in a function incorrectly marked with
6901       // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
6902       // trap, so just use a null pointer.
6903       QueuePtr = DAG.getConstant(0, SL, MVT::i64);
6904     } else {
6905       QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
6906                                       MVT::i64);
6907     }
6908   }
6909 
6910   SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
6911   SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
6912                                    QueuePtr, SDValue());
6913 
6914   uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
6915   SDValue Ops[] = {
6916     ToReg,
6917     DAG.getTargetConstant(TrapID, SL, MVT::i16),
6918     SGPR01,
6919     ToReg.getValue(1)
6920   };
6921   return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6922 }
6923 
6924 SDValue SITargetLowering::lowerTrapHsa(
6925     SDValue Op, SelectionDAG &DAG) const {
6926   SDLoc SL(Op);
6927   SDValue Chain = Op.getOperand(0);
6928 
6929   // We need to simulate the 's_trap 2' instruction on targets that run in
6930   // PRIV=1 (where it is treated as a nop).
6931   if (Subtarget->hasPrivEnabledTrap2NopBug())
6932     return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
6933 
6934   uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
6935   SDValue Ops[] = {
6936     Chain,
6937     DAG.getTargetConstant(TrapID, SL, MVT::i16)
6938   };
6939   return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6940 }
6941 
6942 SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
6943   SDLoc SL(Op);
6944   SDValue Chain = Op.getOperand(0);
6945   MachineFunction &MF = DAG.getMachineFunction();
6946 
6947   if (!Subtarget->isTrapHandlerEnabled() ||
6948       Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6949     DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
6950                                      "debugtrap handler not supported",
6951                                      Op.getDebugLoc(),
6952                                      DS_Warning);
6953     LLVMContext &Ctx = MF.getFunction().getContext();
6954     Ctx.diagnose(NoTrap);
6955     return Chain;
6956   }
6957 
6958   uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
6959   SDValue Ops[] = {
6960     Chain,
6961     DAG.getTargetConstant(TrapID, SL, MVT::i16)
6962   };
6963   return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6964 }
6965 
6966 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
6967                                              SelectionDAG &DAG) const {
6968   if (Subtarget->hasApertureRegs()) {
6969     const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
6970                                        ? AMDGPU::SRC_SHARED_BASE
6971                                        : AMDGPU::SRC_PRIVATE_BASE;
6972     // Note: this feature (register) is broken. When used as a 32-bit operand,
6973     // it returns a wrong value (all zeroes?). The real value is in the upper 32
6974     // bits.
6975     //
6976     // To work around the issue, directly emit a 64 bit mov from this register
6977     // then extract the high bits. Note that this shouldn't even result in a
6978     // shift being emitted and simply become a pair of registers (e.g.):
6979     //    s_mov_b64 s[6:7], src_shared_base
6980     //    v_mov_b32_e32 v1, s7
6981     //
6982     // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
6983     // coalescing would kick in and it would think it's okay to use the "HI"
6984     // subregister directly (instead of extracting the HI 32 bits) which is an
6985     // artificial (unusable) register.
6986     //  Register TableGen definitions would need an overhaul to get rid of the
6987     //  artificial "HI" aperture registers and prevent this kind of issue from
6988     //  happening.
6989     SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
6990                                      DAG.getRegister(ApertureRegNo, MVT::i64));
6991     return DAG.getNode(
6992         ISD::TRUNCATE, DL, MVT::i32,
6993         DAG.getNode(ISD::SRL, DL, MVT::i64,
6994                     {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
6995   }
6996 
6997   // For code object version 5, private_base and shared_base are passed through
6998   // implicit kernargs.
6999   const Module *M = DAG.getMachineFunction().getFunction().getParent();
7000   if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7001     ImplicitParameter Param =
7002         (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
7003     return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7004   }
7005 
7006   MachineFunction &MF = DAG.getMachineFunction();
7007   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7008   Register UserSGPR = Info->getQueuePtrUserSGPR();
7009   if (UserSGPR == AMDGPU::NoRegister) {
7010     // We probably are in a function incorrectly marked with
7011     // amdgpu-no-queue-ptr. This is undefined.
7012     return DAG.getUNDEF(MVT::i32);
7013   }
7014 
7015   SDValue QueuePtr = CreateLiveInRegister(
7016     DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7017 
7018   // Offset into amd_queue_t for group_segment_aperture_base_hi /
7019   // private_segment_aperture_base_hi.
7020   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7021 
7022   SDValue Ptr =
7023       DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7024 
7025   // TODO: Use custom target PseudoSourceValue.
7026   // TODO: We should use the value from the IR intrinsic call, but it might not
7027   // be available and how do we get it?
7028   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7029   return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7030                      commonAlignment(Align(64), StructOffset),
7031                      MachineMemOperand::MODereferenceable |
7032                          MachineMemOperand::MOInvariant);
7033 }
7034 
7035 /// Return true if the value is a known valid address, such that a null check is
7036 /// not necessary.
7037 static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
7038                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7039   if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
7040       isa<BasicBlockSDNode>(Val))
7041     return true;
7042 
7043   if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7044     return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7045 
7046   // TODO: Search through arithmetic, handle arguments and loads
7047   // marked nonnull.
7048   return false;
7049 }
7050 
7051 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7052                                              SelectionDAG &DAG) const {
7053   SDLoc SL(Op);
7054 
7055   const AMDGPUTargetMachine &TM =
7056     static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7057 
7058   unsigned DestAS, SrcAS;
7059   SDValue Src;
7060   bool IsNonNull = false;
7061   if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7062     SrcAS = ASC->getSrcAddressSpace();
7063     Src = ASC->getOperand(0);
7064     DestAS = ASC->getDestAddressSpace();
7065   } else {
7066     assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7067            Op.getConstantOperandVal(0) ==
7068                Intrinsic::amdgcn_addrspacecast_nonnull);
7069     Src = Op->getOperand(1);
7070     SrcAS = Op->getConstantOperandVal(2);
7071     DestAS = Op->getConstantOperandVal(3);
7072     IsNonNull = true;
7073   }
7074 
7075   SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7076 
7077   // flat -> local/private
7078   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7079     if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7080         DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7081       SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7082 
7083       if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7084         return Ptr;
7085 
7086       unsigned NullVal = TM.getNullPointerValue(DestAS);
7087       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7088       SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7089 
7090       return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7091                          SegmentNullPtr);
7092     }
7093   }
7094 
7095   // local/private -> flat
7096   if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7097     if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7098         SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7099 
7100       SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7101       SDValue CvtPtr =
7102           DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7103       CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7104 
7105       if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7106         return CvtPtr;
7107 
7108       unsigned NullVal = TM.getNullPointerValue(SrcAS);
7109       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7110 
7111       SDValue NonNull
7112         = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7113 
7114       return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7115                          FlatNullPtr);
7116     }
7117   }
7118 
7119   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7120       Op.getValueType() == MVT::i64) {
7121     const SIMachineFunctionInfo *Info =
7122         DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
7123     SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7124     SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7125     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7126   }
7127 
7128   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7129       Src.getValueType() == MVT::i64)
7130     return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7131 
7132   // global <-> flat are no-ops and never emitted.
7133 
7134   const MachineFunction &MF = DAG.getMachineFunction();
7135   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
7136     MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
7137   DAG.getContext()->diagnose(InvalidAddrSpaceCast);
7138 
7139   return DAG.getUNDEF(Op->getValueType(0));
7140 }
7141 
7142 // This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7143 // the small vector and inserting them into the big vector. That is better than
7144 // the default expansion of doing it via a stack slot. Even though the use of
7145 // the stack slot would be optimized away afterwards, the stack slot itself
7146 // remains.
7147 SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7148                                                 SelectionDAG &DAG) const {
7149   SDValue Vec = Op.getOperand(0);
7150   SDValue Ins = Op.getOperand(1);
7151   SDValue Idx = Op.getOperand(2);
7152   EVT VecVT = Vec.getValueType();
7153   EVT InsVT = Ins.getValueType();
7154   EVT EltVT = VecVT.getVectorElementType();
7155   unsigned InsNumElts = InsVT.getVectorNumElements();
7156   unsigned IdxVal = Idx->getAsZExtVal();
7157   SDLoc SL(Op);
7158 
7159   if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7160     // Insert 32-bit registers at a time.
7161     assert(InsNumElts % 2 == 0 && "expect legal vector types");
7162 
7163     unsigned VecNumElts = VecVT.getVectorNumElements();
7164     EVT NewVecVT =
7165         EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7166     EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7167                                    : EVT::getVectorVT(*DAG.getContext(),
7168                                                       MVT::i32, InsNumElts / 2);
7169 
7170     Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7171     Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7172 
7173     for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7174       SDValue Elt;
7175       if (InsNumElts == 2) {
7176         Elt = Ins;
7177       } else {
7178         Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7179                           DAG.getConstant(I, SL, MVT::i32));
7180       }
7181       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7182                         DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7183     }
7184 
7185     return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7186   }
7187 
7188   for (unsigned I = 0; I != InsNumElts; ++I) {
7189     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7190                               DAG.getConstant(I, SL, MVT::i32));
7191     Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7192                       DAG.getConstant(IdxVal + I, SL, MVT::i32));
7193   }
7194   return Vec;
7195 }
7196 
7197 SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7198                                                  SelectionDAG &DAG) const {
7199   SDValue Vec = Op.getOperand(0);
7200   SDValue InsVal = Op.getOperand(1);
7201   SDValue Idx = Op.getOperand(2);
7202   EVT VecVT = Vec.getValueType();
7203   EVT EltVT = VecVT.getVectorElementType();
7204   unsigned VecSize = VecVT.getSizeInBits();
7205   unsigned EltSize = EltVT.getSizeInBits();
7206   SDLoc SL(Op);
7207 
7208   // Specially handle the case of v4i16 with static indexing.
7209   unsigned NumElts = VecVT.getVectorNumElements();
7210   auto KIdx = dyn_cast<ConstantSDNode>(Idx);
7211   if (NumElts == 4 && EltSize == 16 && KIdx) {
7212     SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7213 
7214     SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7215                                  DAG.getConstant(0, SL, MVT::i32));
7216     SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7217                                  DAG.getConstant(1, SL, MVT::i32));
7218 
7219     SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7220     SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7221 
7222     unsigned Idx = KIdx->getZExtValue();
7223     bool InsertLo = Idx < 2;
7224     SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
7225       InsertLo ? LoVec : HiVec,
7226       DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7227       DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7228 
7229     InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7230 
7231     SDValue Concat = InsertLo ?
7232       DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
7233       DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
7234 
7235     return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7236   }
7237 
7238   // Static indexing does not lower to stack access, and hence there is no need
7239   // for special custom lowering to avoid stack access.
7240   if (isa<ConstantSDNode>(Idx))
7241     return SDValue();
7242 
7243   // Avoid stack access for dynamic indexing by custom lowering to
7244   // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
7245 
7246   assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7247 
7248   MVT IntVT = MVT::getIntegerVT(VecSize);
7249 
7250   // Convert vector index to bit-index and get the required bit mask.
7251   assert(isPowerOf2_32(EltSize));
7252   const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7253   SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7254   SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7255   SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7256                             DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7257 
7258   // 1. Create a congruent vector with the target value in each element.
7259   SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7260                                DAG.getSplatBuildVector(VecVT, SL, InsVal));
7261 
7262   // 2. Mask off all other indices except the required index within (1).
7263   SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7264 
7265   // 3. Mask off the required index within the target vector.
7266   SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7267   SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
7268                             DAG.getNOT(SL, BFM, IntVT), BCVec);
7269 
7270   // 4. Get (2) and (3) ORed into the target vector.
7271   SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
7272 
7273   return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7274 }
7275 
7276 SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7277                                                   SelectionDAG &DAG) const {
7278   SDLoc SL(Op);
7279 
7280   EVT ResultVT = Op.getValueType();
7281   SDValue Vec = Op.getOperand(0);
7282   SDValue Idx = Op.getOperand(1);
7283   EVT VecVT = Vec.getValueType();
7284   unsigned VecSize = VecVT.getSizeInBits();
7285   EVT EltVT = VecVT.getVectorElementType();
7286 
7287   DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7288 
7289   // Make sure we do any optimizations that will make it easier to fold
7290   // source modifiers before obscuring it with bit operations.
7291 
7292   // XXX - Why doesn't this get called when vector_shuffle is expanded?
7293   if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
7294     return Combined;
7295 
7296   if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7297     SDValue Lo, Hi;
7298     EVT LoVT, HiVT;
7299     std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
7300 
7301     if (VecSize == 128) {
7302       SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7303       Lo = DAG.getBitcast(LoVT,
7304                           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7305                                       DAG.getConstant(0, SL, MVT::i32)));
7306       Hi = DAG.getBitcast(HiVT,
7307                           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7308                                       DAG.getConstant(1, SL, MVT::i32)));
7309     } else if (VecSize == 256) {
7310       SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7311       SDValue Parts[4];
7312       for (unsigned P = 0; P < 4; ++P) {
7313         Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7314                                DAG.getConstant(P, SL, MVT::i32));
7315       }
7316 
7317       Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7318                                             Parts[0], Parts[1]));
7319       Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7320                                             Parts[2], Parts[3]));
7321     } else {
7322       assert(VecSize == 512);
7323 
7324       SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7325       SDValue Parts[8];
7326       for (unsigned P = 0; P < 8; ++P) {
7327         Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7328                                DAG.getConstant(P, SL, MVT::i32));
7329       }
7330 
7331       Lo = DAG.getBitcast(LoVT,
7332                           DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7333                                       Parts[0], Parts[1], Parts[2], Parts[3]));
7334       Hi = DAG.getBitcast(HiVT,
7335                           DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7336                                       Parts[4], Parts[5],Parts[6], Parts[7]));
7337     }
7338 
7339     EVT IdxVT = Idx.getValueType();
7340     unsigned NElem = VecVT.getVectorNumElements();
7341     assert(isPowerOf2_32(NElem));
7342     SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7343     SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7344     SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7345     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7346   }
7347 
7348   assert(VecSize <= 64);
7349 
7350   MVT IntVT = MVT::getIntegerVT(VecSize);
7351 
7352   // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7353   SDValue VecBC = peekThroughBitcasts(Vec);
7354   if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7355     SDValue Src = VecBC.getOperand(0);
7356     Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7357     Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7358   }
7359 
7360   unsigned EltSize = EltVT.getSizeInBits();
7361   assert(isPowerOf2_32(EltSize));
7362 
7363   SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7364 
7365   // Convert vector index to bit-index (* EltSize)
7366   SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7367 
7368   SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7369   SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7370 
7371   if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7372     SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7373     return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7374   }
7375 
7376   return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7377 }
7378 
7379 static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7380   assert(Elt % 2 == 0);
7381   return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7382 }
7383 
7384 SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7385                                               SelectionDAG &DAG) const {
7386   SDLoc SL(Op);
7387   EVT ResultVT = Op.getValueType();
7388   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7389 
7390   EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
7391   EVT EltVT = PackVT.getVectorElementType();
7392   int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7393 
7394   // vector_shuffle <0,1,6,7> lhs, rhs
7395   // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7396   //
7397   // vector_shuffle <6,7,2,3> lhs, rhs
7398   // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7399   //
7400   // vector_shuffle <6,7,0,1> lhs, rhs
7401   // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7402 
7403   // Avoid scalarizing when both halves are reading from consecutive elements.
7404   SmallVector<SDValue, 4> Pieces;
7405   for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7406     if (elementPairIsContiguous(SVN->getMask(), I)) {
7407       const int Idx = SVN->getMaskElt(I);
7408       int VecIdx = Idx < SrcNumElts ? 0 : 1;
7409       int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7410       SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
7411                                     PackVT, SVN->getOperand(VecIdx),
7412                                     DAG.getConstant(EltIdx, SL, MVT::i32));
7413       Pieces.push_back(SubVec);
7414     } else {
7415       const int Idx0 = SVN->getMaskElt(I);
7416       const int Idx1 = SVN->getMaskElt(I + 1);
7417       int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7418       int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7419       int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7420       int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7421 
7422       SDValue Vec0 = SVN->getOperand(VecIdx0);
7423       SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7424                                  Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
7425 
7426       SDValue Vec1 = SVN->getOperand(VecIdx1);
7427       SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7428                                  Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
7429       Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
7430     }
7431   }
7432 
7433   return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7434 }
7435 
7436 SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7437                                                 SelectionDAG &DAG) const {
7438   SDValue SVal = Op.getOperand(0);
7439   EVT ResultVT = Op.getValueType();
7440   EVT SValVT = SVal.getValueType();
7441   SDValue UndefVal = DAG.getUNDEF(SValVT);
7442   SDLoc SL(Op);
7443 
7444   SmallVector<SDValue, 8> VElts;
7445   VElts.push_back(SVal);
7446   for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7447     VElts.push_back(UndefVal);
7448 
7449   return DAG.getBuildVector(ResultVT, SL, VElts);
7450 }
7451 
7452 SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7453                                             SelectionDAG &DAG) const {
7454   SDLoc SL(Op);
7455   EVT VT = Op.getValueType();
7456 
7457   if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7458       VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
7459     EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7460                                   VT.getVectorNumElements() / 2);
7461     MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());
7462 
7463     // Turn into pair of packed build_vectors.
7464     // TODO: Special case for constants that can be materialized with s_mov_b64.
7465     SmallVector<SDValue, 4> LoOps, HiOps;
7466     for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
7467       LoOps.push_back(Op.getOperand(I));
7468       HiOps.push_back(Op.getOperand(I + E));
7469     }
7470     SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
7471     SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);
7472 
7473     SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
7474     SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);
7475 
7476     SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
7477                                        { CastLo, CastHi });
7478     return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7479   }
7480 
7481   if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
7482     EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7483                                      VT.getVectorNumElements() / 4);
7484     MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7485 
7486     SmallVector<SDValue, 4> Parts[4];
7487     for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
7488       for (unsigned P = 0; P < 4; ++P)
7489         Parts[P].push_back(Op.getOperand(I + P * E));
7490     }
7491     SDValue Casts[4];
7492     for (unsigned P = 0; P < 4; ++P) {
7493       SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7494       Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7495     }
7496 
7497     SDValue Blend =
7498         DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts);
7499     return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7500   }
7501 
7502   if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
7503     EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7504                                      VT.getVectorNumElements() / 8);
7505     MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7506 
7507     SmallVector<SDValue, 8> Parts[8];
7508     for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
7509       for (unsigned P = 0; P < 8; ++P)
7510         Parts[P].push_back(Op.getOperand(I + P * E));
7511     }
7512     SDValue Casts[8];
7513     for (unsigned P = 0; P < 8; ++P) {
7514       SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7515       Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7516     }
7517 
7518     SDValue Blend =
7519         DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts);
7520     return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7521   }
7522 
7523   assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
7524   assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7525 
7526   SDValue Lo = Op.getOperand(0);
7527   SDValue Hi = Op.getOperand(1);
7528 
7529   // Avoid adding defined bits with the zero_extend.
7530   if (Hi.isUndef()) {
7531     Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7532     SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7533     return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7534   }
7535 
7536   Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7537   Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7538 
7539   SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7540                               DAG.getConstant(16, SL, MVT::i32));
7541   if (Lo.isUndef())
7542     return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7543 
7544   Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7545   Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7546 
7547   SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
7548   return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7549 }
7550 
7551 bool
7552 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
7553   // OSes that use ELF REL relocations (instead of RELA) can only store a
7554   // 32-bit addend in the instruction, so it is not safe to allow offset folding
7555   // which can create arbitrary 64-bit addends. (This is only a problem for
7556   // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7557   // the high 32 bits of the addend.)
7558   //
7559   // This should be kept in sync with how HasRelocationAddend is initialized in
7560   // the constructor of ELFAMDGPUAsmBackend.
7561   if (!Subtarget->isAmdHsaOS())
7562     return false;
7563 
7564   // We can fold offsets for anything that doesn't require a GOT relocation.
7565   return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7566           GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7567           GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7568          !shouldEmitGOTReloc(GA->getGlobal());
7569 }
7570 
7571 static SDValue
7572 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
7573                         const SDLoc &DL, int64_t Offset, EVT PtrVT,
7574                         unsigned GAFlags = SIInstrInfo::MO_NONE) {
7575   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7576   // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7577   // lowered to the following code sequence:
7578   //
7579   // For constant address space:
7580   //   s_getpc_b64 s[0:1]
7581   //   s_add_u32 s0, s0, $symbol
7582   //   s_addc_u32 s1, s1, 0
7583   //
7584   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
7585   //   a fixup or relocation is emitted to replace $symbol with a literal
7586   //   constant, which is a pc-relative offset from the encoding of the $symbol
7587   //   operand to the global variable.
7588   //
7589   // For global address space:
7590   //   s_getpc_b64 s[0:1]
7591   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7592   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7593   //
7594   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
7595   //   fixups or relocations are emitted to replace $symbol@*@lo and
7596   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7597   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
7598   //   operand to the global variable.
7599   SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7600   SDValue PtrHi;
7601   if (GAFlags == SIInstrInfo::MO_NONE)
7602     PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7603   else
7604     PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7605   return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7606 }
7607 
7608 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7609                                              SDValue Op,
7610                                              SelectionDAG &DAG) const {
7611   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7612   SDLoc DL(GSD);
7613   EVT PtrVT = Op.getValueType();
7614 
7615   const GlobalValue *GV = GSD->getGlobal();
7616   if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7617        shouldUseLDSConstAddress(GV)) ||
7618       GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
7619       GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
7620     if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7621         GV->hasExternalLinkage()) {
7622       Type *Ty = GV->getValueType();
7623       // HIP uses an unsized array `extern __shared__ T s[]` or similar
7624       // zero-sized type in other languages to declare the dynamic shared
7625       // memory which size is not known at the compile time. They will be
7626       // allocated by the runtime and placed directly after the static
7627       // allocated ones. They all share the same offset.
7628       if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7629         assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7630         // Adjust alignment for that dynamic shared memory array.
7631         Function &F = DAG.getMachineFunction().getFunction();
7632         MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7633         MFI->setUsesDynamicLDS(true);
7634         return SDValue(
7635             DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7636       }
7637     }
7638     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
7639   }
7640 
7641   if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
7642     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7643                                             SIInstrInfo::MO_ABS32_LO);
7644     return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7645   }
7646 
7647   if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7648     SDValue AddrLo = DAG.getTargetGlobalAddress(
7649         GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7650     AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7651 
7652     SDValue AddrHi = DAG.getTargetGlobalAddress(
7653         GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7654     AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7655 
7656     return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7657   }
7658 
7659   if (shouldEmitFixup(GV))
7660     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7661 
7662   if (shouldEmitPCReloc(GV))
7663     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7664                                    SIInstrInfo::MO_REL32);
7665 
7666   SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7667                                             SIInstrInfo::MO_GOTPCREL32);
7668 
7669   Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
7670   PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
7671   const DataLayout &DataLayout = DAG.getDataLayout();
7672   Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7673   MachinePointerInfo PtrInfo
7674     = MachinePointerInfo::getGOT(DAG.getMachineFunction());
7675 
7676   return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7677                      MachineMemOperand::MODereferenceable |
7678                          MachineMemOperand::MOInvariant);
7679 }
7680 
7681 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
7682                                    const SDLoc &DL, SDValue V) const {
7683   // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7684   // the destination register.
7685   //
7686   // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7687   // so we will end up with redundant moves to m0.
7688   //
7689   // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7690 
7691   // A Null SDValue creates a glue result.
7692   SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7693                                   V, Chain);
7694   return SDValue(M0, 0);
7695 }
7696 
7697 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
7698                                                  SDValue Op,
7699                                                  MVT VT,
7700                                                  unsigned Offset) const {
7701   SDLoc SL(Op);
7702   SDValue Param = lowerKernargMemParameter(
7703       DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7704   // The local size values will have the hi 16-bits as zero.
7705   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7706                      DAG.getValueType(VT));
7707 }
7708 
7709 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7710                                         EVT VT) {
7711   DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7712                                       "non-hsa intrinsic with hsa target",
7713                                       DL.getDebugLoc());
7714   DAG.getContext()->diagnose(BadIntrin);
7715   return DAG.getUNDEF(VT);
7716 }
7717 
7718 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7719                                          EVT VT) {
7720   DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7721                                       "intrinsic not supported on subtarget",
7722                                       DL.getDebugLoc());
7723   DAG.getContext()->diagnose(BadIntrin);
7724   return DAG.getUNDEF(VT);
7725 }
7726 
7727 static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
7728                                     ArrayRef<SDValue> Elts) {
7729   assert(!Elts.empty());
7730   MVT Type;
7731   unsigned NumElts = Elts.size();
7732 
7733   if (NumElts <= 12) {
7734     Type = MVT::getVectorVT(MVT::f32, NumElts);
7735   } else {
7736     assert(Elts.size() <= 16);
7737     Type = MVT::v16f32;
7738     NumElts = 16;
7739   }
7740 
7741   SmallVector<SDValue, 16> VecElts(NumElts);
7742   for (unsigned i = 0; i < Elts.size(); ++i) {
7743     SDValue Elt = Elts[i];
7744     if (Elt.getValueType() != MVT::f32)
7745       Elt = DAG.getBitcast(MVT::f32, Elt);
7746     VecElts[i] = Elt;
7747   }
7748   for (unsigned i = Elts.size(); i < NumElts; ++i)
7749     VecElts[i] = DAG.getUNDEF(MVT::f32);
7750 
7751   if (NumElts == 1)
7752     return VecElts[0];
7753   return DAG.getBuildVector(Type, DL, VecElts);
7754 }
7755 
7756 static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7757                               SDValue Src, int ExtraElts) {
7758   EVT SrcVT = Src.getValueType();
7759 
7760   SmallVector<SDValue, 8> Elts;
7761 
7762   if (SrcVT.isVector())
7763     DAG.ExtractVectorElements(Src, Elts);
7764   else
7765     Elts.push_back(Src);
7766 
7767   SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
7768   while (ExtraElts--)
7769     Elts.push_back(Undef);
7770 
7771   return DAG.getBuildVector(CastVT, DL, Elts);
7772 }
7773 
7774 // Re-construct the required return value for a image load intrinsic.
7775 // This is more complicated due to the optional use TexFailCtrl which means the required
7776 // return type is an aggregate
7777 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
7778                                  ArrayRef<EVT> ResultTypes, bool IsTexFail,
7779                                  bool Unpacked, bool IsD16, int DMaskPop,
7780                                  int NumVDataDwords, bool IsAtomicPacked16Bit,
7781                                  const SDLoc &DL) {
7782   // Determine the required return type. This is the same regardless of IsTexFail flag
7783   EVT ReqRetVT = ResultTypes[0];
7784   int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7785   int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7786                           ? (ReqRetNumElts + 1) / 2
7787                           : ReqRetNumElts;
7788 
7789   int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
7790 
7791   MVT DataDwordVT = NumDataDwords == 1 ?
7792     MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7793 
7794   MVT MaskPopVT = MaskPopDwords == 1 ?
7795     MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7796 
7797   SDValue Data(Result, 0);
7798   SDValue TexFail;
7799 
7800   if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7801     SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7802     if (MaskPopVT.isVector()) {
7803       Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
7804                          SDValue(Result, 0), ZeroIdx);
7805     } else {
7806       Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
7807                          SDValue(Result, 0), ZeroIdx);
7808     }
7809   }
7810 
7811   if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7812     Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
7813                           NumDataDwords - MaskPopDwords);
7814 
7815   if (IsD16)
7816     Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
7817 
7818   EVT LegalReqRetVT = ReqRetVT;
7819   if (!ReqRetVT.isVector()) {
7820     if (!Data.getValueType().isInteger())
7821       Data = DAG.getNode(ISD::BITCAST, DL,
7822                          Data.getValueType().changeTypeToInteger(), Data);
7823     Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
7824   } else {
7825     // We need to widen the return vector to a legal type
7826     if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7827         ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7828       LegalReqRetVT =
7829           EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
7830                            ReqRetVT.getVectorNumElements() + 1);
7831     }
7832   }
7833   Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
7834 
7835   if (IsTexFail) {
7836     TexFail =
7837         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7838                     DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7839 
7840     return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
7841   }
7842 
7843   if (Result->getNumValues() == 1)
7844     return Data;
7845 
7846   return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
7847 }
7848 
7849 static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
7850                          SDValue *LWE, bool &IsTexFail) {
7851   auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
7852 
7853   uint64_t Value = TexFailCtrlConst->getZExtValue();
7854   if (Value) {
7855     IsTexFail = true;
7856   }
7857 
7858   SDLoc DL(TexFailCtrlConst);
7859   *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
7860   Value &= ~(uint64_t)0x1;
7861   *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
7862   Value &= ~(uint64_t)0x2;
7863 
7864   return Value == 0;
7865 }
7866 
7867 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
7868                                       MVT PackVectorVT,
7869                                       SmallVectorImpl<SDValue> &PackedAddrs,
7870                                       unsigned DimIdx, unsigned EndIdx,
7871                                       unsigned NumGradients) {
7872   SDLoc DL(Op);
7873   for (unsigned I = DimIdx; I < EndIdx; I++) {
7874     SDValue Addr = Op.getOperand(I);
7875 
7876     // Gradients are packed with undef for each coordinate.
7877     // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
7878     // 1D: undef,dx/dh; undef,dx/dv
7879     // 2D: dy/dh,dx/dh; dy/dv,dx/dv
7880     // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
7881     if (((I + 1) >= EndIdx) ||
7882         ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
7883                                          I == DimIdx + NumGradients - 1))) {
7884       if (Addr.getValueType() != MVT::i16)
7885         Addr = DAG.getBitcast(MVT::i16, Addr);
7886       Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
7887     } else {
7888       Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
7889       I++;
7890     }
7891     Addr = DAG.getBitcast(MVT::f32, Addr);
7892     PackedAddrs.push_back(Addr);
7893   }
7894 }
7895 
7896 SDValue SITargetLowering::lowerImage(SDValue Op,
7897                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
7898                                      SelectionDAG &DAG, bool WithChain) const {
7899   SDLoc DL(Op);
7900   MachineFunction &MF = DAG.getMachineFunction();
7901   const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
7902   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7903       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
7904   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
7905   unsigned IntrOpcode = Intr->BaseOpcode;
7906   bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
7907   bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
7908   bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
7909 
7910   SmallVector<EVT, 3> ResultTypes(Op->values());
7911   SmallVector<EVT, 3> OrigResultTypes(Op->values());
7912   bool IsD16 = false;
7913   bool IsG16 = false;
7914   bool IsA16 = false;
7915   SDValue VData;
7916   int NumVDataDwords = 0;
7917   bool AdjustRetType = false;
7918   bool IsAtomicPacked16Bit = false;
7919 
7920   // Offset of intrinsic arguments
7921   const unsigned ArgOffset = WithChain ? 2 : 1;
7922 
7923   unsigned DMask;
7924   unsigned DMaskLanes = 0;
7925 
7926   if (BaseOpcode->Atomic) {
7927     VData = Op.getOperand(2);
7928 
7929     IsAtomicPacked16Bit =
7930         (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7931          Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7932 
7933     bool Is64Bit = VData.getValueSizeInBits() == 64;
7934     if (BaseOpcode->AtomicX2) {
7935       SDValue VData2 = Op.getOperand(3);
7936       VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
7937                                  {VData, VData2});
7938       if (Is64Bit)
7939         VData = DAG.getBitcast(MVT::v4i32, VData);
7940 
7941       ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
7942       DMask = Is64Bit ? 0xf : 0x3;
7943       NumVDataDwords = Is64Bit ? 4 : 2;
7944     } else {
7945       DMask = Is64Bit ? 0x3 : 0x1;
7946       NumVDataDwords = Is64Bit ? 2 : 1;
7947     }
7948   } else {
7949     DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
7950     DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
7951 
7952     if (BaseOpcode->Store) {
7953       VData = Op.getOperand(2);
7954 
7955       MVT StoreVT = VData.getSimpleValueType();
7956       if (StoreVT.getScalarType() == MVT::f16) {
7957         if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7958           return Op; // D16 is unsupported for this instruction
7959 
7960         IsD16 = true;
7961         VData = handleD16VData(VData, DAG, true);
7962       }
7963 
7964       NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
7965     } else if (!BaseOpcode->NoReturn) {
7966       // Work out the num dwords based on the dmask popcount and underlying type
7967       // and whether packing is supported.
7968       MVT LoadVT = ResultTypes[0].getSimpleVT();
7969       if (LoadVT.getScalarType() == MVT::f16) {
7970         if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7971           return Op; // D16 is unsupported for this instruction
7972 
7973         IsD16 = true;
7974       }
7975 
7976       // Confirm that the return type is large enough for the dmask specified
7977       if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
7978           (!LoadVT.isVector() && DMaskLanes > 1))
7979           return Op;
7980 
7981       // The sq block of gfx8 and gfx9 do not estimate register use correctly
7982       // for d16 image_gather4, image_gather4_l, and image_gather4_lz
7983       // instructions.
7984       if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
7985           !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
7986         NumVDataDwords = (DMaskLanes + 1) / 2;
7987       else
7988         NumVDataDwords = DMaskLanes;
7989 
7990       AdjustRetType = true;
7991     }
7992   }
7993 
7994   unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
7995   SmallVector<SDValue, 4> VAddrs;
7996 
7997   // Check for 16 bit addresses or derivatives and pack if true.
7998   MVT VAddrVT =
7999       Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8000   MVT VAddrScalarVT = VAddrVT.getScalarType();
8001   MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8002   IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8003 
8004   VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8005   VAddrScalarVT = VAddrVT.getScalarType();
8006   MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8007   IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8008 
8009   // Push back extra arguments.
8010   for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8011     if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8012       assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8013       // Special handling of bias when A16 is on. Bias is of type half but
8014       // occupies full 32-bit.
8015       SDValue Bias = DAG.getBuildVector(
8016           MVT::v2f16, DL,
8017           {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
8018       VAddrs.push_back(Bias);
8019     } else {
8020       assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8021              "Bias needs to be converted to 16 bit in A16 mode");
8022       VAddrs.push_back(Op.getOperand(ArgOffset + I));
8023     }
8024   }
8025 
8026   if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8027     // 16 bit gradients are supported, but are tied to the A16 control
8028     // so both gradients and addresses must be 16 bit
8029     LLVM_DEBUG(
8030         dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8031                   "require 16 bit args for both gradients and addresses");
8032     return Op;
8033   }
8034 
8035   if (IsA16) {
8036     if (!ST->hasA16()) {
8037       LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8038                            "support 16 bit addresses\n");
8039       return Op;
8040     }
8041   }
8042 
8043   // We've dealt with incorrect input so we know that if IsA16, IsG16
8044   // are set then we have to compress/pack operands (either address,
8045   // gradient or both)
8046   // In the case where a16 and gradients are tied (no G16 support) then we
8047   // have already verified that both IsA16 and IsG16 are true
8048   if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8049     // Activate g16
8050     const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8051         AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
8052     IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8053   }
8054 
8055   // Add gradients (packed or unpacked)
8056   if (IsG16) {
8057     // Pack the gradients
8058     // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8059     packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8060                               ArgOffset + Intr->GradientStart,
8061                               ArgOffset + Intr->CoordStart, Intr->NumGradients);
8062   } else {
8063     for (unsigned I = ArgOffset + Intr->GradientStart;
8064          I < ArgOffset + Intr->CoordStart; I++)
8065       VAddrs.push_back(Op.getOperand(I));
8066   }
8067 
8068   // Add addresses (packed or unpacked)
8069   if (IsA16) {
8070     packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8071                               ArgOffset + Intr->CoordStart, VAddrEnd,
8072                               0 /* No gradients */);
8073   } else {
8074     // Add uncompressed address
8075     for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8076       VAddrs.push_back(Op.getOperand(I));
8077   }
8078 
8079   // If the register allocator cannot place the address registers contiguously
8080   // without introducing moves, then using the non-sequential address encoding
8081   // is always preferable, since it saves VALU instructions and is usually a
8082   // wash in terms of code size or even better.
8083   //
8084   // However, we currently have no way of hinting to the register allocator that
8085   // MIMG addresses should be placed contiguously when it is possible to do so,
8086   // so force non-NSA for the common 2-address case as a heuristic.
8087   //
8088   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8089   // allocation when possible.
8090   //
8091   // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8092   // set of the remaining addresses.
8093   const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8094   const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8095   const bool UseNSA = ST->hasNSAEncoding() &&
8096                       VAddrs.size() >= ST->getNSAThreshold(MF) &&
8097                       (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8098   const bool UsePartialNSA =
8099       UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8100 
8101   SDValue VAddr;
8102   if (UsePartialNSA) {
8103     VAddr = getBuildDwordsVector(DAG, DL,
8104                                  ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8105   }
8106   else if (!UseNSA) {
8107     VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8108   }
8109 
8110   SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8111   SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8112   SDValue Unorm;
8113   if (!BaseOpcode->Sampler) {
8114     Unorm = True;
8115   } else {
8116     uint64_t UnormConst =
8117         Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8118 
8119     Unorm = UnormConst ? True : False;
8120   }
8121 
8122   SDValue TFE;
8123   SDValue LWE;
8124   SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8125   bool IsTexFail = false;
8126   if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8127     return Op;
8128 
8129   if (IsTexFail) {
8130     if (!DMaskLanes) {
8131       // Expecting to get an error flag since TFC is on - and dmask is 0
8132       // Force dmask to be at least 1 otherwise the instruction will fail
8133       DMask = 0x1;
8134       DMaskLanes = 1;
8135       NumVDataDwords = 1;
8136     }
8137     NumVDataDwords += 1;
8138     AdjustRetType = true;
8139   }
8140 
8141   // Has something earlier tagged that the return type needs adjusting
8142   // This happens if the instruction is a load or has set TexFailCtrl flags
8143   if (AdjustRetType) {
8144     // NumVDataDwords reflects the true number of dwords required in the return type
8145     if (DMaskLanes == 0 && !BaseOpcode->Store) {
8146       // This is a no-op load. This can be eliminated
8147       SDValue Undef = DAG.getUNDEF(Op.getValueType());
8148       if (isa<MemSDNode>(Op))
8149         return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
8150       return Undef;
8151     }
8152 
8153     EVT NewVT = NumVDataDwords > 1 ?
8154                   EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
8155                 : MVT::i32;
8156 
8157     ResultTypes[0] = NewVT;
8158     if (ResultTypes.size() == 3) {
8159       // Original result was aggregate type used for TexFailCtrl results
8160       // The actual instruction returns as a vector type which has now been
8161       // created. Remove the aggregate result.
8162       ResultTypes.erase(&ResultTypes[1]);
8163     }
8164   }
8165 
8166   unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8167   if (BaseOpcode->Atomic)
8168     CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8169   if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8170                AMDGPU::CPol::VOLATILE))
8171     return Op;
8172 
8173   SmallVector<SDValue, 26> Ops;
8174   if (BaseOpcode->Store || BaseOpcode->Atomic)
8175     Ops.push_back(VData); // vdata
8176   if (UsePartialNSA) {
8177     append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8178     Ops.push_back(VAddr);
8179   }
8180   else if (UseNSA)
8181     append_range(Ops, VAddrs);
8182   else
8183     Ops.push_back(VAddr);
8184   Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex));
8185   if (BaseOpcode->Sampler)
8186     Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
8187   Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
8188   if (IsGFX10Plus)
8189     Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8190   if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8191     Ops.push_back(Unorm);
8192   Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
8193   Ops.push_back(IsA16 &&  // r128, a16 for gfx9
8194                 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
8195   if (IsGFX10Plus)
8196     Ops.push_back(IsA16 ? True : False);
8197   if (!Subtarget->hasGFX90AInsts()) {
8198     Ops.push_back(TFE); //tfe
8199   } else if (TFE->getAsZExtVal()) {
8200     report_fatal_error("TFE is not supported on this GPU");
8201   }
8202   if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8203     Ops.push_back(LWE); // lwe
8204   if (!IsGFX10Plus)
8205     Ops.push_back(DimInfo->DA ? True : False);
8206   if (BaseOpcode->HasD16)
8207     Ops.push_back(IsD16 ? True : False);
8208   if (isa<MemSDNode>(Op))
8209     Ops.push_back(Op.getOperand(0)); // chain
8210 
8211   int NumVAddrDwords =
8212       UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8213   int Opcode = -1;
8214 
8215   if (IsGFX12Plus) {
8216     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
8217                                    NumVDataDwords, NumVAddrDwords);
8218   } else if (IsGFX11Plus) {
8219     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8220                                    UseNSA ? AMDGPU::MIMGEncGfx11NSA
8221                                           : AMDGPU::MIMGEncGfx11Default,
8222                                    NumVDataDwords, NumVAddrDwords);
8223   } else if (IsGFX10Plus) {
8224     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8225                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
8226                                           : AMDGPU::MIMGEncGfx10Default,
8227                                    NumVDataDwords, NumVAddrDwords);
8228   } else {
8229     if (Subtarget->hasGFX90AInsts()) {
8230       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
8231                                      NumVDataDwords, NumVAddrDwords);
8232       if (Opcode == -1)
8233         report_fatal_error(
8234             "requested image instruction is not supported on this GPU");
8235     }
8236     if (Opcode == -1 &&
8237         Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8238       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
8239                                      NumVDataDwords, NumVAddrDwords);
8240     if (Opcode == -1)
8241       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
8242                                      NumVDataDwords, NumVAddrDwords);
8243   }
8244   if (Opcode == -1)
8245     return Op;
8246 
8247   MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
8248   if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
8249     MachineMemOperand *MemRef = MemOp->getMemOperand();
8250     DAG.setNodeMemRefs(NewNode, {MemRef});
8251   }
8252 
8253   if (BaseOpcode->AtomicX2) {
8254     SmallVector<SDValue, 1> Elt;
8255     DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
8256     return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
8257   }
8258   if (BaseOpcode->NoReturn)
8259     return SDValue(NewNode, 0);
8260   return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
8261                            Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8262                            NumVDataDwords, IsAtomicPacked16Bit, DL);
8263 }
8264 
8265 SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8266                                        SDValue Offset, SDValue CachePolicy,
8267                                        SelectionDAG &DAG) const {
8268   MachineFunction &MF = DAG.getMachineFunction();
8269 
8270   const DataLayout &DataLayout = DAG.getDataLayout();
8271   Align Alignment =
8272       DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
8273 
8274   MachineMemOperand *MMO = MF.getMachineMemOperand(
8275       MachinePointerInfo(),
8276       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
8277           MachineMemOperand::MOInvariant,
8278       VT.getStoreSize(), Alignment);
8279 
8280   if (!Offset->isDivergent()) {
8281     SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8282 
8283     // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8284     // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8285     // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
8286     // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8287     if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8288       SDValue BufferLoad =
8289           DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
8290                                   DAG.getVTList(MVT::i32), Ops, VT, MMO);
8291       return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8292     }
8293 
8294     // Widen vec3 load to vec4.
8295     if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8296         !Subtarget->hasScalarDwordx3Loads()) {
8297       EVT WidenedVT =
8298           EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
8299       auto WidenedOp = DAG.getMemIntrinsicNode(
8300           AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8301           MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8302       auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8303                                    DAG.getVectorIdxConstant(0, DL));
8304       return Subvector;
8305     }
8306 
8307     return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
8308                                    DAG.getVTList(VT), Ops, VT, MMO);
8309   }
8310 
8311   // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8312   // assume that the buffer is unswizzled.
8313   SDValue Ops[] = {
8314       DAG.getEntryNode(),                    // Chain
8315       Rsrc,                                  // rsrc
8316       DAG.getConstant(0, DL, MVT::i32),      // vindex
8317       {},                                    // voffset
8318       {},                                    // soffset
8319       {},                                    // offset
8320       CachePolicy,                           // cachepolicy
8321       DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8322   };
8323   if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8324     setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8325     return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8326   }
8327 
8328   SmallVector<SDValue, 4> Loads;
8329   unsigned NumLoads = 1;
8330   MVT LoadVT = VT.getSimpleVT();
8331   unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8332   assert((LoadVT.getScalarType() == MVT::i32 ||
8333           LoadVT.getScalarType() == MVT::f32));
8334 
8335   if (NumElts == 8 || NumElts == 16) {
8336     NumLoads = NumElts / 4;
8337     LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8338   }
8339 
8340   SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8341 
8342   // Use the alignment to ensure that the required offsets will fit into the
8343   // immediate offsets.
8344   setBufferOffsets(Offset, DAG, &Ops[3],
8345                    NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8346 
8347   uint64_t InstOffset = Ops[5]->getAsZExtVal();
8348   for (unsigned i = 0; i < NumLoads; ++i) {
8349     Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8350     Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8351                                         LoadVT, MMO, DAG));
8352   }
8353 
8354   if (NumElts == 8 || NumElts == 16)
8355     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8356 
8357   return Loads[0];
8358 }
8359 
8360 SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8361   // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8362   if (!Subtarget->hasArchitectedSGPRs())
8363     return {};
8364   SDLoc SL(Op);
8365   MVT VT = MVT::i32;
8366   SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8367   return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8368                      DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8369 }
8370 
8371 SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8372                                           unsigned Dim,
8373                                           const ArgDescriptor &Arg) const {
8374   SDLoc SL(Op);
8375   MachineFunction &MF = DAG.getMachineFunction();
8376   unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8377   if (MaxID == 0)
8378     return DAG.getConstant(0, SL, MVT::i32);
8379 
8380   SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8381                                SDLoc(DAG.getEntryNode()), Arg);
8382 
8383   // Don't bother inserting AssertZext for packed IDs since we're emitting the
8384   // masking operations anyway.
8385   //
8386   // TODO: We could assert the top bit is 0 for the source copy.
8387   if (Arg.isMasked())
8388     return Val;
8389 
8390   // Preserve the known bits after expansion to a copy.
8391   EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
8392   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8393                      DAG.getValueType(SmallVT));
8394 }
8395 
8396 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8397                                                   SelectionDAG &DAG) const {
8398   MachineFunction &MF = DAG.getMachineFunction();
8399   auto MFI = MF.getInfo<SIMachineFunctionInfo>();
8400 
8401   EVT VT = Op.getValueType();
8402   SDLoc DL(Op);
8403   unsigned IntrinsicID = Op.getConstantOperandVal(0);
8404 
8405   // TODO: Should this propagate fast-math-flags?
8406 
8407   switch (IntrinsicID) {
8408   case Intrinsic::amdgcn_implicit_buffer_ptr: {
8409     if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8410       return emitNonHSAIntrinsicError(DAG, DL, VT);
8411     return getPreloadedValue(DAG, *MFI, VT,
8412                              AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
8413   }
8414   case Intrinsic::amdgcn_dispatch_ptr:
8415   case Intrinsic::amdgcn_queue_ptr: {
8416     if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8417       DiagnosticInfoUnsupported BadIntrin(
8418           MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8419           DL.getDebugLoc());
8420       DAG.getContext()->diagnose(BadIntrin);
8421       return DAG.getUNDEF(VT);
8422     }
8423 
8424     auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
8425       AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
8426     return getPreloadedValue(DAG, *MFI, VT, RegID);
8427   }
8428   case Intrinsic::amdgcn_implicitarg_ptr: {
8429     if (MFI->isEntryFunction())
8430       return getImplicitArgPtr(DAG, DL);
8431     return getPreloadedValue(DAG, *MFI, VT,
8432                              AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
8433   }
8434   case Intrinsic::amdgcn_kernarg_segment_ptr: {
8435     if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
8436       // This only makes sense to call in a kernel, so just lower to null.
8437       return DAG.getConstant(0, DL, VT);
8438     }
8439 
8440     return getPreloadedValue(DAG, *MFI, VT,
8441                              AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
8442   }
8443   case Intrinsic::amdgcn_dispatch_id: {
8444     return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8445   }
8446   case Intrinsic::amdgcn_rcp:
8447     return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8448   case Intrinsic::amdgcn_rsq:
8449     return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8450   case Intrinsic::amdgcn_rsq_legacy:
8451     if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8452       return emitRemovedIntrinsicError(DAG, DL, VT);
8453     return SDValue();
8454   case Intrinsic::amdgcn_rcp_legacy:
8455     if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8456       return emitRemovedIntrinsicError(DAG, DL, VT);
8457     return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8458   case Intrinsic::amdgcn_rsq_clamp: {
8459     if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8460       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8461 
8462     Type *Type = VT.getTypeForEVT(*DAG.getContext());
8463     APFloat Max = APFloat::getLargest(Type->getFltSemantics());
8464     APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
8465 
8466     SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8467     SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
8468                               DAG.getConstantFP(Max, DL, VT));
8469     return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8470                        DAG.getConstantFP(Min, DL, VT));
8471   }
8472   case Intrinsic::r600_read_ngroups_x:
8473     if (Subtarget->isAmdHsaOS())
8474       return emitNonHSAIntrinsicError(DAG, DL, VT);
8475 
8476     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8477                                     SI::KernelInputOffsets::NGROUPS_X, Align(4),
8478                                     false);
8479   case Intrinsic::r600_read_ngroups_y:
8480     if (Subtarget->isAmdHsaOS())
8481       return emitNonHSAIntrinsicError(DAG, DL, VT);
8482 
8483     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8484                                     SI::KernelInputOffsets::NGROUPS_Y, Align(4),
8485                                     false);
8486   case Intrinsic::r600_read_ngroups_z:
8487     if (Subtarget->isAmdHsaOS())
8488       return emitNonHSAIntrinsicError(DAG, DL, VT);
8489 
8490     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8491                                     SI::KernelInputOffsets::NGROUPS_Z, Align(4),
8492                                     false);
8493   case Intrinsic::r600_read_global_size_x:
8494     if (Subtarget->isAmdHsaOS())
8495       return emitNonHSAIntrinsicError(DAG, DL, VT);
8496 
8497     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8498                                     SI::KernelInputOffsets::GLOBAL_SIZE_X,
8499                                     Align(4), false);
8500   case Intrinsic::r600_read_global_size_y:
8501     if (Subtarget->isAmdHsaOS())
8502       return emitNonHSAIntrinsicError(DAG, DL, VT);
8503 
8504     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8505                                     SI::KernelInputOffsets::GLOBAL_SIZE_Y,
8506                                     Align(4), false);
8507   case Intrinsic::r600_read_global_size_z:
8508     if (Subtarget->isAmdHsaOS())
8509       return emitNonHSAIntrinsicError(DAG, DL, VT);
8510 
8511     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8512                                     SI::KernelInputOffsets::GLOBAL_SIZE_Z,
8513                                     Align(4), false);
8514   case Intrinsic::r600_read_local_size_x:
8515     if (Subtarget->isAmdHsaOS())
8516       return emitNonHSAIntrinsicError(DAG, DL, VT);
8517 
8518     return lowerImplicitZextParam(DAG, Op, MVT::i16,
8519                                   SI::KernelInputOffsets::LOCAL_SIZE_X);
8520   case Intrinsic::r600_read_local_size_y:
8521     if (Subtarget->isAmdHsaOS())
8522       return emitNonHSAIntrinsicError(DAG, DL, VT);
8523 
8524     return lowerImplicitZextParam(DAG, Op, MVT::i16,
8525                                   SI::KernelInputOffsets::LOCAL_SIZE_Y);
8526   case Intrinsic::r600_read_local_size_z:
8527     if (Subtarget->isAmdHsaOS())
8528       return emitNonHSAIntrinsicError(DAG, DL, VT);
8529 
8530     return lowerImplicitZextParam(DAG, Op, MVT::i16,
8531                                   SI::KernelInputOffsets::LOCAL_SIZE_Z);
8532   case Intrinsic::amdgcn_workgroup_id_x:
8533     return getPreloadedValue(DAG, *MFI, VT,
8534                              AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
8535   case Intrinsic::amdgcn_workgroup_id_y:
8536     return getPreloadedValue(DAG, *MFI, VT,
8537                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
8538   case Intrinsic::amdgcn_workgroup_id_z:
8539     return getPreloadedValue(DAG, *MFI, VT,
8540                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
8541   case Intrinsic::amdgcn_wave_id:
8542     return lowerWaveID(DAG, Op);
8543   case Intrinsic::amdgcn_lds_kernel_id: {
8544     if (MFI->isEntryFunction())
8545       return getLDSKernelId(DAG, DL);
8546     return getPreloadedValue(DAG, *MFI, VT,
8547                              AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
8548   }
8549   case Intrinsic::amdgcn_workitem_id_x:
8550     return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8551   case Intrinsic::amdgcn_workitem_id_y:
8552     return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8553   case Intrinsic::amdgcn_workitem_id_z:
8554     return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8555   case Intrinsic::amdgcn_wavefrontsize:
8556     return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
8557                            SDLoc(Op), MVT::i32);
8558   case Intrinsic::amdgcn_s_buffer_load: {
8559     unsigned CPol = Op.getConstantOperandVal(3);
8560     // s_buffer_load, because of how it's optimized, can't be volatile
8561     // so reject ones with the volatile bit set.
8562     if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8563                      ? AMDGPU::CPol::ALL
8564                      : AMDGPU::CPol::ALL_pregfx12))
8565       return Op;
8566     return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8567                         DAG);
8568   }
8569   case Intrinsic::amdgcn_fdiv_fast:
8570     return lowerFDIV_FAST(Op, DAG);
8571   case Intrinsic::amdgcn_sin:
8572     return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8573 
8574   case Intrinsic::amdgcn_cos:
8575     return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8576 
8577   case Intrinsic::amdgcn_mul_u24:
8578     return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8579   case Intrinsic::amdgcn_mul_i24:
8580     return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8581 
8582   case Intrinsic::amdgcn_log_clamp: {
8583     if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8584       return SDValue();
8585 
8586     return emitRemovedIntrinsicError(DAG, DL, VT);
8587   }
8588   case Intrinsic::amdgcn_fract:
8589     return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8590 
8591   case Intrinsic::amdgcn_class:
8592     return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
8593                        Op.getOperand(1), Op.getOperand(2));
8594   case Intrinsic::amdgcn_div_fmas:
8595     return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
8596                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8597                        Op.getOperand(4));
8598 
8599   case Intrinsic::amdgcn_div_fixup:
8600     return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
8601                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8602 
8603   case Intrinsic::amdgcn_div_scale: {
8604     const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8605 
8606     // Translate to the operands expected by the machine instruction. The
8607     // first parameter must be the same as the first instruction.
8608     SDValue Numerator = Op.getOperand(1);
8609     SDValue Denominator = Op.getOperand(2);
8610 
8611     // Note this order is opposite of the machine instruction's operations,
8612     // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8613     // intrinsic has the numerator as the first operand to match a normal
8614     // division operation.
8615 
8616     SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8617 
8618     return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8619                        Denominator, Numerator);
8620   }
8621   case Intrinsic::amdgcn_icmp: {
8622     // There is a Pat that handles this variant, so return it as-is.
8623     if (Op.getOperand(1).getValueType() == MVT::i1 &&
8624         Op.getConstantOperandVal(2) == 0 &&
8625         Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8626       return Op;
8627     return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8628   }
8629   case Intrinsic::amdgcn_fcmp: {
8630     return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8631   }
8632   case Intrinsic::amdgcn_ballot:
8633     return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8634   case Intrinsic::amdgcn_fmed3:
8635     return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
8636                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8637   case Intrinsic::amdgcn_fdot2:
8638     return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
8639                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8640                        Op.getOperand(4));
8641   case Intrinsic::amdgcn_fmul_legacy:
8642     return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
8643                        Op.getOperand(1), Op.getOperand(2));
8644   case Intrinsic::amdgcn_sffbh:
8645     return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8646   case Intrinsic::amdgcn_sbfe:
8647     return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
8648                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8649   case Intrinsic::amdgcn_ubfe:
8650     return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
8651                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8652   case Intrinsic::amdgcn_cvt_pkrtz:
8653   case Intrinsic::amdgcn_cvt_pknorm_i16:
8654   case Intrinsic::amdgcn_cvt_pknorm_u16:
8655   case Intrinsic::amdgcn_cvt_pk_i16:
8656   case Intrinsic::amdgcn_cvt_pk_u16: {
8657     // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8658     EVT VT = Op.getValueType();
8659     unsigned Opcode;
8660 
8661     if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8662       Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
8663     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8664       Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8665     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8666       Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8667     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8668       Opcode = AMDGPUISD::CVT_PK_I16_I32;
8669     else
8670       Opcode = AMDGPUISD::CVT_PK_U16_U32;
8671 
8672     if (isTypeLegal(VT))
8673       return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8674 
8675     SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
8676                                Op.getOperand(1), Op.getOperand(2));
8677     return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8678   }
8679   case Intrinsic::amdgcn_fmad_ftz:
8680     return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8681                        Op.getOperand(2), Op.getOperand(3));
8682 
8683   case Intrinsic::amdgcn_if_break:
8684     return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8685                                       Op->getOperand(1), Op->getOperand(2)), 0);
8686 
8687   case Intrinsic::amdgcn_groupstaticsize: {
8688     Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
8689     if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8690       return Op;
8691 
8692     const Module *M = MF.getFunction().getParent();
8693     const GlobalValue *GV =
8694         M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
8695     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8696                                             SIInstrInfo::MO_ABS32_LO);
8697     return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8698   }
8699   case Intrinsic::amdgcn_is_shared:
8700   case Intrinsic::amdgcn_is_private: {
8701     SDLoc SL(Op);
8702     unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
8703       AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
8704     SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8705     SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
8706                                  Op.getOperand(1));
8707 
8708     SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8709                                 DAG.getConstant(1, SL, MVT::i32));
8710     return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8711   }
8712   case Intrinsic::amdgcn_perm:
8713     return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8714                        Op.getOperand(2), Op.getOperand(3));
8715   case Intrinsic::amdgcn_reloc_constant: {
8716     Module *M = const_cast<Module *>(MF.getFunction().getParent());
8717     const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8718     auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8719     auto RelocSymbol = cast<GlobalVariable>(
8720         M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8721     SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8722                                             SIInstrInfo::MO_ABS32_LO);
8723     return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8724   }
8725   case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8726   case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8727   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8728   case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8729   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8730   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8731   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8732   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8733     if (Op.getOperand(4).getValueType() == MVT::i32)
8734       return SDValue();
8735 
8736     SDLoc SL(Op);
8737     auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8738     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8739                        Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8740                        Op.getOperand(3), IndexKeyi32);
8741   }
8742   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8743   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8744   case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8745     if (Op.getOperand(6).getValueType() == MVT::i32)
8746       return SDValue();
8747 
8748     SDLoc SL(Op);
8749     auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8750     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8751                        {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8752                         Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8753                         IndexKeyi32, Op.getOperand(7)});
8754   }
8755   case Intrinsic::amdgcn_addrspacecast_nonnull:
8756     return lowerADDRSPACECAST(Op, DAG);
8757   case Intrinsic::amdgcn_readlane:
8758   case Intrinsic::amdgcn_readfirstlane:
8759   case Intrinsic::amdgcn_writelane:
8760   case Intrinsic::amdgcn_permlane16:
8761   case Intrinsic::amdgcn_permlanex16:
8762   case Intrinsic::amdgcn_permlane64:
8763     return lowerLaneOp(*this, Op.getNode(), DAG);
8764   default:
8765     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8766             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
8767       return lowerImage(Op, ImageDimIntr, DAG, false);
8768 
8769     return Op;
8770   }
8771 }
8772 
8773 // On targets not supporting constant in soffset field, turn zero to
8774 // SGPR_NULL to avoid generating an extra s_mov with zero.
8775 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
8776                              const GCNSubtarget *Subtarget) {
8777   if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8778     return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8779   return SOffset;
8780 }
8781 
8782 SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8783                                                      SelectionDAG &DAG,
8784                                                      unsigned NewOpcode) const {
8785   SDLoc DL(Op);
8786 
8787   SDValue VData = Op.getOperand(2);
8788   SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8789   auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8790   auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8791   SDValue Ops[] = {
8792       Op.getOperand(0),                      // Chain
8793       VData,                                 // vdata
8794       Rsrc,                                  // rsrc
8795       DAG.getConstant(0, DL, MVT::i32),      // vindex
8796       Offsets.first,                         // voffset
8797       SOffset,                               // soffset
8798       Offsets.second,                        // offset
8799       Op.getOperand(6),                      // cachepolicy
8800       DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8801   };
8802 
8803   auto *M = cast<MemSDNode>(Op);
8804 
8805   EVT MemVT = VData.getValueType();
8806   return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8807                                  M->getMemOperand());
8808 }
8809 
8810 SDValue
8811 SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8812                                                 unsigned NewOpcode) const {
8813   SDLoc DL(Op);
8814 
8815   SDValue VData = Op.getOperand(2);
8816   SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8817   auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8818   auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8819   SDValue Ops[] = {
8820       Op.getOperand(0),                      // Chain
8821       VData,                                 // vdata
8822       Rsrc,                                  // rsrc
8823       Op.getOperand(4),                      // vindex
8824       Offsets.first,                         // voffset
8825       SOffset,                               // soffset
8826       Offsets.second,                        // offset
8827       Op.getOperand(7),                      // cachepolicy
8828       DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8829   };
8830 
8831   auto *M = cast<MemSDNode>(Op);
8832 
8833   EVT MemVT = VData.getValueType();
8834   return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8835                                  M->getMemOperand());
8836 }
8837 
8838 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
8839                                                  SelectionDAG &DAG) const {
8840   unsigned IntrID = Op.getConstantOperandVal(1);
8841   SDLoc DL(Op);
8842 
8843   switch (IntrID) {
8844   case Intrinsic::amdgcn_ds_ordered_add:
8845   case Intrinsic::amdgcn_ds_ordered_swap: {
8846     MemSDNode *M = cast<MemSDNode>(Op);
8847     SDValue Chain = M->getOperand(0);
8848     SDValue M0 = M->getOperand(2);
8849     SDValue Value = M->getOperand(3);
8850     unsigned IndexOperand = M->getConstantOperandVal(7);
8851     unsigned WaveRelease = M->getConstantOperandVal(8);
8852     unsigned WaveDone = M->getConstantOperandVal(9);
8853 
8854     unsigned OrderedCountIndex = IndexOperand & 0x3f;
8855     IndexOperand &= ~0x3f;
8856     unsigned CountDw = 0;
8857 
8858     if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
8859       CountDw = (IndexOperand >> 24) & 0xf;
8860       IndexOperand &= ~(0xf << 24);
8861 
8862       if (CountDw < 1 || CountDw > 4) {
8863         report_fatal_error(
8864             "ds_ordered_count: dword count must be between 1 and 4");
8865       }
8866     }
8867 
8868     if (IndexOperand)
8869       report_fatal_error("ds_ordered_count: bad index operand");
8870 
8871     if (WaveDone && !WaveRelease)
8872       report_fatal_error("ds_ordered_count: wave_done requires wave_release");
8873 
8874     unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
8875     unsigned ShaderType =
8876         SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
8877     unsigned Offset0 = OrderedCountIndex << 2;
8878     unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
8879 
8880     if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
8881       Offset1 |= (CountDw - 1) << 6;
8882 
8883     if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
8884       Offset1 |= ShaderType << 2;
8885 
8886     unsigned Offset = Offset0 | (Offset1 << 8);
8887 
8888     SDValue Ops[] = {
8889       Chain,
8890       Value,
8891       DAG.getTargetConstant(Offset, DL, MVT::i16),
8892       copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
8893     };
8894     return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
8895                                    M->getVTList(), Ops, M->getMemoryVT(),
8896                                    M->getMemOperand());
8897   }
8898   case Intrinsic::amdgcn_raw_buffer_load:
8899   case Intrinsic::amdgcn_raw_ptr_buffer_load:
8900   case Intrinsic::amdgcn_raw_atomic_buffer_load:
8901   case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8902   case Intrinsic::amdgcn_raw_buffer_load_format:
8903   case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
8904     const bool IsFormat =
8905         IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
8906         IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
8907 
8908     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8909     auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8910     auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8911     SDValue Ops[] = {
8912         Op.getOperand(0),                      // Chain
8913         Rsrc,                                  // rsrc
8914         DAG.getConstant(0, DL, MVT::i32),      // vindex
8915         Offsets.first,                         // voffset
8916         SOffset,                               // soffset
8917         Offsets.second,                        // offset
8918         Op.getOperand(5),                      // cachepolicy, swizzled buffer
8919         DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8920     };
8921 
8922     auto *M = cast<MemSDNode>(Op);
8923     return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
8924   }
8925   case Intrinsic::amdgcn_struct_buffer_load:
8926   case Intrinsic::amdgcn_struct_ptr_buffer_load:
8927   case Intrinsic::amdgcn_struct_buffer_load_format:
8928   case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
8929     const bool IsFormat =
8930         IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
8931         IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
8932 
8933     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8934     auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8935     auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8936     SDValue Ops[] = {
8937         Op.getOperand(0),                      // Chain
8938         Rsrc,                                  // rsrc
8939         Op.getOperand(3),                      // vindex
8940         Offsets.first,                         // voffset
8941         SOffset,                               // soffset
8942         Offsets.second,                        // offset
8943         Op.getOperand(6),                      // cachepolicy, swizzled buffer
8944         DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8945     };
8946 
8947     return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
8948   }
8949   case Intrinsic::amdgcn_raw_tbuffer_load:
8950   case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
8951     MemSDNode *M = cast<MemSDNode>(Op);
8952     EVT LoadVT = Op.getValueType();
8953     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8954     auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8955     auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8956 
8957     SDValue Ops[] = {
8958         Op.getOperand(0),                      // Chain
8959         Rsrc,                                  // rsrc
8960         DAG.getConstant(0, DL, MVT::i32),      // vindex
8961         Offsets.first,                         // voffset
8962         SOffset,                               // soffset
8963         Offsets.second,                        // offset
8964         Op.getOperand(5),                      // format
8965         Op.getOperand(6),                      // cachepolicy, swizzled buffer
8966         DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8967     };
8968 
8969     if (LoadVT.getScalarType() == MVT::f16)
8970       return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8971                                  M, DAG, Ops);
8972     return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8973                                Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8974                                DAG);
8975   }
8976   case Intrinsic::amdgcn_struct_tbuffer_load:
8977   case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
8978     MemSDNode *M = cast<MemSDNode>(Op);
8979     EVT LoadVT = Op.getValueType();
8980     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8981     auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8982     auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8983 
8984     SDValue Ops[] = {
8985         Op.getOperand(0),                      // Chain
8986         Rsrc,                                  // rsrc
8987         Op.getOperand(3),                      // vindex
8988         Offsets.first,                         // voffset
8989         SOffset,                               // soffset
8990         Offsets.second,                        // offset
8991         Op.getOperand(6),                      // format
8992         Op.getOperand(7),                      // cachepolicy, swizzled buffer
8993         DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8994     };
8995 
8996     if (LoadVT.getScalarType() == MVT::f16)
8997       return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8998                                  M, DAG, Ops);
8999     return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9000                                Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9001                                DAG);
9002   }
9003   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9004   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9005     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9006   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9007   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9008     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9009   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9010   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9011     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9012   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9013   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9014     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9015   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9016   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9017     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9018   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9019   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9020     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9021   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9022   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9023     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9024   case Intrinsic::amdgcn_raw_buffer_atomic_add:
9025   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9026     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9027   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9028   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9029     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9030   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9031   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9032     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9033   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9034   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9035     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9036   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9037   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9038     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9039   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9040   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9041     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9042   case Intrinsic::amdgcn_raw_buffer_atomic_and:
9043   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9044     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9045   case Intrinsic::amdgcn_raw_buffer_atomic_or:
9046   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9047     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9048   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9049   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9050     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9051   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9052   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9053     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9054   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9055   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9056     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9057   case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9058     return lowerRawBufferAtomicIntrin(Op, DAG,
9059                                       AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9060   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9061   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9062     return lowerStructBufferAtomicIntrin(Op, DAG,
9063                                          AMDGPUISD::BUFFER_ATOMIC_SWAP);
9064   case Intrinsic::amdgcn_struct_buffer_atomic_add:
9065   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9066     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9067   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9068   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9069     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9070   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9071   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9072     return lowerStructBufferAtomicIntrin(Op, DAG,
9073                                          AMDGPUISD::BUFFER_ATOMIC_SMIN);
9074   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9075   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9076     return lowerStructBufferAtomicIntrin(Op, DAG,
9077                                          AMDGPUISD::BUFFER_ATOMIC_UMIN);
9078   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9079   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9080     return lowerStructBufferAtomicIntrin(Op, DAG,
9081                                          AMDGPUISD::BUFFER_ATOMIC_SMAX);
9082   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9083   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9084     return lowerStructBufferAtomicIntrin(Op, DAG,
9085                                          AMDGPUISD::BUFFER_ATOMIC_UMAX);
9086   case Intrinsic::amdgcn_struct_buffer_atomic_and:
9087   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9088     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9089   case Intrinsic::amdgcn_struct_buffer_atomic_or:
9090   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9091     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9092   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9093   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9094     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9095   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9096   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9097     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9098   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9099   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9100     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9101   case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9102     return lowerStructBufferAtomicIntrin(Op, DAG,
9103                                          AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9104 
9105   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9106   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9107     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9108     auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9109     auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9110     SDValue Ops[] = {
9111         Op.getOperand(0),                      // Chain
9112         Op.getOperand(2),                      // src
9113         Op.getOperand(3),                      // cmp
9114         Rsrc,                                  // rsrc
9115         DAG.getConstant(0, DL, MVT::i32),      // vindex
9116         Offsets.first,                         // voffset
9117         SOffset,                               // soffset
9118         Offsets.second,                        // offset
9119         Op.getOperand(7),                      // cachepolicy
9120         DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9121     };
9122     EVT VT = Op.getValueType();
9123     auto *M = cast<MemSDNode>(Op);
9124 
9125     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9126                                    Op->getVTList(), Ops, VT, M->getMemOperand());
9127   }
9128   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9129   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9130     SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9131     auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
9132     auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9133     SDValue Ops[] = {
9134         Op.getOperand(0),                      // Chain
9135         Op.getOperand(2),                      // src
9136         Op.getOperand(3),                      // cmp
9137         Rsrc,                                  // rsrc
9138         Op.getOperand(5),                      // vindex
9139         Offsets.first,                         // voffset
9140         SOffset,                               // soffset
9141         Offsets.second,                        // offset
9142         Op.getOperand(8),                      // cachepolicy
9143         DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9144     };
9145     EVT VT = Op.getValueType();
9146     auto *M = cast<MemSDNode>(Op);
9147 
9148     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9149                                    Op->getVTList(), Ops, VT, M->getMemOperand());
9150   }
9151   case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9152     MemSDNode *M = cast<MemSDNode>(Op);
9153     SDValue NodePtr = M->getOperand(2);
9154     SDValue RayExtent = M->getOperand(3);
9155     SDValue RayOrigin = M->getOperand(4);
9156     SDValue RayDir = M->getOperand(5);
9157     SDValue RayInvDir = M->getOperand(6);
9158     SDValue TDescr = M->getOperand(7);
9159 
9160     assert(NodePtr.getValueType() == MVT::i32 ||
9161            NodePtr.getValueType() == MVT::i64);
9162     assert(RayDir.getValueType() == MVT::v3f16 ||
9163            RayDir.getValueType() == MVT::v3f32);
9164 
9165     if (!Subtarget->hasGFX10_AEncoding()) {
9166       emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9167       return SDValue();
9168     }
9169 
9170     const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9171     const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9172     const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9173     const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9174     const bool Is64 = NodePtr.getValueType() == MVT::i64;
9175     const unsigned NumVDataDwords = 4;
9176     const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9177     const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9178     const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9179                          NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9180                         IsGFX12Plus;
9181     const unsigned BaseOpcodes[2][2] = {
9182         {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9183         {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9184          AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9185     int Opcode;
9186     if (UseNSA) {
9187       Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9188                                      IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9189                                      : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
9190                                                  : AMDGPU::MIMGEncGfx10NSA,
9191                                      NumVDataDwords, NumVAddrDwords);
9192     } else {
9193       assert(!IsGFX12Plus);
9194       Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9195                                      IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9196                                              : AMDGPU::MIMGEncGfx10Default,
9197                                      NumVDataDwords, NumVAddrDwords);
9198     }
9199     assert(Opcode != -1);
9200 
9201     SmallVector<SDValue, 16> Ops;
9202 
9203     auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
9204       SmallVector<SDValue, 3> Lanes;
9205       DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9206       if (Lanes[0].getValueSizeInBits() == 32) {
9207         for (unsigned I = 0; I < 3; ++I)
9208           Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9209       } else {
9210         if (IsAligned) {
9211           Ops.push_back(
9212             DAG.getBitcast(MVT::i32,
9213                            DAG.getBuildVector(MVT::v2f16, DL,
9214                                               { Lanes[0], Lanes[1] })));
9215           Ops.push_back(Lanes[2]);
9216         } else {
9217           SDValue Elt0 = Ops.pop_back_val();
9218           Ops.push_back(
9219             DAG.getBitcast(MVT::i32,
9220                            DAG.getBuildVector(MVT::v2f16, DL,
9221                                               { Elt0, Lanes[0] })));
9222           Ops.push_back(
9223             DAG.getBitcast(MVT::i32,
9224                            DAG.getBuildVector(MVT::v2f16, DL,
9225                                               { Lanes[1], Lanes[2] })));
9226         }
9227       }
9228     };
9229 
9230     if (UseNSA && IsGFX11Plus) {
9231       Ops.push_back(NodePtr);
9232       Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9233       Ops.push_back(RayOrigin);
9234       if (IsA16) {
9235         SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9236         DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9237         DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9238         for (unsigned I = 0; I < 3; ++I) {
9239           MergedLanes.push_back(DAG.getBitcast(
9240               MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9241                                            {DirLanes[I], InvDirLanes[I]})));
9242         }
9243         Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9244       } else {
9245         Ops.push_back(RayDir);
9246         Ops.push_back(RayInvDir);
9247       }
9248     } else {
9249       if (Is64)
9250         DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9251                                   2);
9252       else
9253         Ops.push_back(NodePtr);
9254 
9255       Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9256       packLanes(RayOrigin, true);
9257       packLanes(RayDir, true);
9258       packLanes(RayInvDir, false);
9259     }
9260 
9261     if (!UseNSA) {
9262       // Build a single vector containing all the operands so far prepared.
9263       if (NumVAddrDwords > 12) {
9264         SDValue Undef = DAG.getUNDEF(MVT::i32);
9265         Ops.append(16 - Ops.size(), Undef);
9266       }
9267       assert(Ops.size() >= 8 && Ops.size() <= 12);
9268       SDValue MergedOps = DAG.getBuildVector(
9269           MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9270       Ops.clear();
9271       Ops.push_back(MergedOps);
9272     }
9273 
9274     Ops.push_back(TDescr);
9275     Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9276     Ops.push_back(M->getChain());
9277 
9278     auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9279     MachineMemOperand *MemRef = M->getMemOperand();
9280     DAG.setNodeMemRefs(NewNode, {MemRef});
9281     return SDValue(NewNode, 0);
9282   }
9283   case Intrinsic::amdgcn_global_atomic_fmin:
9284   case Intrinsic::amdgcn_global_atomic_fmax:
9285   case Intrinsic::amdgcn_global_atomic_fmin_num:
9286   case Intrinsic::amdgcn_global_atomic_fmax_num:
9287   case Intrinsic::amdgcn_flat_atomic_fmin:
9288   case Intrinsic::amdgcn_flat_atomic_fmax:
9289   case Intrinsic::amdgcn_flat_atomic_fmin_num:
9290   case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9291     MemSDNode *M = cast<MemSDNode>(Op);
9292     SDValue Ops[] = {
9293       M->getOperand(0), // Chain
9294       M->getOperand(2), // Ptr
9295       M->getOperand(3)  // Value
9296     };
9297     unsigned Opcode = 0;
9298     switch (IntrID) {
9299     case Intrinsic::amdgcn_global_atomic_fmin:
9300     case Intrinsic::amdgcn_global_atomic_fmin_num:
9301     case Intrinsic::amdgcn_flat_atomic_fmin:
9302     case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9303       Opcode = ISD::ATOMIC_LOAD_FMIN;
9304       break;
9305     }
9306     case Intrinsic::amdgcn_global_atomic_fmax:
9307     case Intrinsic::amdgcn_global_atomic_fmax_num:
9308     case Intrinsic::amdgcn_flat_atomic_fmax:
9309     case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9310       Opcode = ISD::ATOMIC_LOAD_FMAX;
9311       break;
9312     }
9313     default:
9314       llvm_unreachable("unhandled atomic opcode");
9315     }
9316     return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9317                          Ops, M->getMemOperand());
9318   }
9319   case Intrinsic::amdgcn_s_get_barrier_state: {
9320     SDValue Chain = Op->getOperand(0);
9321     SmallVector<SDValue, 2> Ops;
9322     unsigned Opc;
9323     bool IsInlinableBarID = false;
9324     int64_t BarID;
9325 
9326     if (isa<ConstantSDNode>(Op->getOperand(2))) {
9327       BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
9328       IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
9329     }
9330 
9331     if (IsInlinableBarID) {
9332       Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9333       SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9334       Ops.push_back(K);
9335     } else {
9336       Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9337       SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2));
9338       Ops.push_back(M0Val.getValue(0));
9339     }
9340 
9341     auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9342     return SDValue(NewMI, 0);
9343   }
9344   default:
9345 
9346     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9347             AMDGPU::getImageDimIntrinsicInfo(IntrID))
9348       return lowerImage(Op, ImageDimIntr, DAG, true);
9349 
9350     return SDValue();
9351   }
9352 }
9353 
9354 // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9355 // dwordx4 if on SI and handle TFE loads.
9356 SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9357                                               SDVTList VTList,
9358                                               ArrayRef<SDValue> Ops, EVT MemVT,
9359                                               MachineMemOperand *MMO,
9360                                               SelectionDAG &DAG) const {
9361   LLVMContext &C = *DAG.getContext();
9362   MachineFunction &MF = DAG.getMachineFunction();
9363   EVT VT = VTList.VTs[0];
9364 
9365   assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9366   bool IsTFE = VTList.NumVTs == 3;
9367   if (IsTFE) {
9368     unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9369     unsigned NumOpDWords = NumValueDWords + 1;
9370     EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9371     SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9372     MachineMemOperand *OpDWordsMMO =
9373         MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9374     SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9375                                      OpDWordsVT, OpDWordsMMO, DAG);
9376     SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9377                                  DAG.getVectorIdxConstant(NumValueDWords, DL));
9378     SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9379     SDValue ValueDWords =
9380         NumValueDWords == 1
9381             ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9382             : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9383                           EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9384                           ZeroIdx);
9385     SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9386     return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9387   }
9388 
9389   if (!Subtarget->hasDwordx3LoadStores() &&
9390       (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9391     EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9392     EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9393     MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9394     SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9395     SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9396                                          WidenedMemVT, WidenedMMO);
9397     SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
9398                                 DAG.getVectorIdxConstant(0, DL));
9399     return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9400   }
9401 
9402   return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9403 }
9404 
9405 SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9406                                          bool ImageStore) const {
9407   EVT StoreVT = VData.getValueType();
9408 
9409   // No change for f16 and legal vector D16 types.
9410   if (!StoreVT.isVector())
9411     return VData;
9412 
9413   SDLoc DL(VData);
9414   unsigned NumElements = StoreVT.getVectorNumElements();
9415 
9416   if (Subtarget->hasUnpackedD16VMem()) {
9417     // We need to unpack the packed data to store.
9418     EVT IntStoreVT = StoreVT.changeTypeToInteger();
9419     SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9420 
9421     EVT EquivStoreVT =
9422         EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9423     SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9424     return DAG.UnrollVectorOp(ZExt.getNode());
9425   }
9426 
9427   // The sq block of gfx8.1 does not estimate register use correctly for d16
9428   // image store instructions. The data operand is computed as if it were not a
9429   // d16 image instruction.
9430   if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9431     // Bitcast to i16
9432     EVT IntStoreVT = StoreVT.changeTypeToInteger();
9433     SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9434 
9435     // Decompose into scalars
9436     SmallVector<SDValue, 4> Elts;
9437     DAG.ExtractVectorElements(IntVData, Elts);
9438 
9439     // Group pairs of i16 into v2i16 and bitcast to i32
9440     SmallVector<SDValue, 4> PackedElts;
9441     for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9442       SDValue Pair =
9443           DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9444       SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9445       PackedElts.push_back(IntPair);
9446     }
9447     if ((NumElements % 2) == 1) {
9448       // Handle v3i16
9449       unsigned I = Elts.size() / 2;
9450       SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9451                                         {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9452       SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9453       PackedElts.push_back(IntPair);
9454     }
9455 
9456     // Pad using UNDEF
9457     PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9458 
9459     // Build final vector
9460     EVT VecVT =
9461         EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9462     return DAG.getBuildVector(VecVT, DL, PackedElts);
9463   }
9464 
9465   if (NumElements == 3) {
9466     EVT IntStoreVT =
9467         EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
9468     SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9469 
9470     EVT WidenedStoreVT = EVT::getVectorVT(
9471         *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9472     EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9473                                          WidenedStoreVT.getStoreSizeInBits());
9474     SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9475     return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9476   }
9477 
9478   assert(isTypeLegal(StoreVT));
9479   return VData;
9480 }
9481 
9482 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9483                                               SelectionDAG &DAG) const {
9484   SDLoc DL(Op);
9485   SDValue Chain = Op.getOperand(0);
9486   unsigned IntrinsicID = Op.getConstantOperandVal(1);
9487   MachineFunction &MF = DAG.getMachineFunction();
9488 
9489   switch (IntrinsicID) {
9490   case Intrinsic::amdgcn_exp_compr: {
9491     if (!Subtarget->hasCompressedExport()) {
9492       DiagnosticInfoUnsupported BadIntrin(
9493           DAG.getMachineFunction().getFunction(),
9494           "intrinsic not supported on subtarget", DL.getDebugLoc());
9495       DAG.getContext()->diagnose(BadIntrin);
9496     }
9497     SDValue Src0 = Op.getOperand(4);
9498     SDValue Src1 = Op.getOperand(5);
9499     // Hack around illegal type on SI by directly selecting it.
9500     if (isTypeLegal(Src0.getValueType()))
9501       return SDValue();
9502 
9503     const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9504     SDValue Undef = DAG.getUNDEF(MVT::f32);
9505     const SDValue Ops[] = {
9506       Op.getOperand(2), // tgt
9507       DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9508       DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9509       Undef, // src2
9510       Undef, // src3
9511       Op.getOperand(7), // vm
9512       DAG.getTargetConstant(1, DL, MVT::i1), // compr
9513       Op.getOperand(3), // en
9514       Op.getOperand(0) // Chain
9515     };
9516 
9517     unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9518     return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9519   }
9520   case Intrinsic::amdgcn_s_barrier: {
9521     const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9522     if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9523       unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9524       if (WGSize <= ST.getWavefrontSize())
9525         return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
9526                                           Op.getOperand(0)), 0);
9527     }
9528 
9529     // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9530     if (ST.hasSplitBarriers()) {
9531       SDValue K =
9532           DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
9533       SDValue BarSignal =
9534           SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9535                                      MVT::Other, K, Op.getOperand(0)),
9536                   0);
9537       SDValue BarWait =
9538           SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9539                                      BarSignal.getValue(0)),
9540                   0);
9541       return BarWait;
9542     }
9543 
9544     return SDValue();
9545   };
9546 
9547   case Intrinsic::amdgcn_struct_tbuffer_store:
9548   case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9549     SDValue VData = Op.getOperand(2);
9550     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9551     if (IsD16)
9552       VData = handleD16VData(VData, DAG);
9553     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9554     auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9555     auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9556     SDValue Ops[] = {
9557         Chain,
9558         VData,                                 // vdata
9559         Rsrc,                                  // rsrc
9560         Op.getOperand(4),                      // vindex
9561         Offsets.first,                         // voffset
9562         SOffset,                               // soffset
9563         Offsets.second,                        // offset
9564         Op.getOperand(7),                      // format
9565         Op.getOperand(8),                      // cachepolicy, swizzled buffer
9566         DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9567     };
9568     unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9569                            AMDGPUISD::TBUFFER_STORE_FORMAT;
9570     MemSDNode *M = cast<MemSDNode>(Op);
9571     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9572                                    M->getMemoryVT(), M->getMemOperand());
9573   }
9574 
9575   case Intrinsic::amdgcn_raw_tbuffer_store:
9576   case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9577     SDValue VData = Op.getOperand(2);
9578     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9579     if (IsD16)
9580       VData = handleD16VData(VData, DAG);
9581     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9582     auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9583     auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9584     SDValue Ops[] = {
9585         Chain,
9586         VData,                                 // vdata
9587         Rsrc,                                  // rsrc
9588         DAG.getConstant(0, DL, MVT::i32),      // vindex
9589         Offsets.first,                         // voffset
9590         SOffset,                               // soffset
9591         Offsets.second,                        // offset
9592         Op.getOperand(6),                      // format
9593         Op.getOperand(7),                      // cachepolicy, swizzled buffer
9594         DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9595     };
9596     unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9597                            AMDGPUISD::TBUFFER_STORE_FORMAT;
9598     MemSDNode *M = cast<MemSDNode>(Op);
9599     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9600                                    M->getMemoryVT(), M->getMemOperand());
9601   }
9602 
9603   case Intrinsic::amdgcn_raw_buffer_store:
9604   case Intrinsic::amdgcn_raw_ptr_buffer_store:
9605   case Intrinsic::amdgcn_raw_buffer_store_format:
9606   case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9607     const bool IsFormat =
9608         IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9609         IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9610 
9611     SDValue VData = Op.getOperand(2);
9612     EVT VDataVT = VData.getValueType();
9613     EVT EltType = VDataVT.getScalarType();
9614     bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9615     if (IsD16) {
9616       VData = handleD16VData(VData, DAG);
9617       VDataVT = VData.getValueType();
9618     }
9619 
9620     if (!isTypeLegal(VDataVT)) {
9621       VData =
9622           DAG.getNode(ISD::BITCAST, DL,
9623                       getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9624     }
9625 
9626     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9627     auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9628     auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9629     SDValue Ops[] = {
9630         Chain,
9631         VData,
9632         Rsrc,
9633         DAG.getConstant(0, DL, MVT::i32),      // vindex
9634         Offsets.first,                         // voffset
9635         SOffset,                               // soffset
9636         Offsets.second,                        // offset
9637         Op.getOperand(6),                      // cachepolicy, swizzled buffer
9638         DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9639     };
9640     unsigned Opc =
9641         IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9642     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9643     MemSDNode *M = cast<MemSDNode>(Op);
9644 
9645     // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9646     if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9647       return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9648 
9649     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9650                                    M->getMemoryVT(), M->getMemOperand());
9651   }
9652 
9653   case Intrinsic::amdgcn_struct_buffer_store:
9654   case Intrinsic::amdgcn_struct_ptr_buffer_store:
9655   case Intrinsic::amdgcn_struct_buffer_store_format:
9656   case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9657     const bool IsFormat =
9658         IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9659         IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9660 
9661     SDValue VData = Op.getOperand(2);
9662     EVT VDataVT = VData.getValueType();
9663     EVT EltType = VDataVT.getScalarType();
9664     bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9665 
9666     if (IsD16) {
9667       VData = handleD16VData(VData, DAG);
9668       VDataVT = VData.getValueType();
9669     }
9670 
9671     if (!isTypeLegal(VDataVT)) {
9672       VData =
9673           DAG.getNode(ISD::BITCAST, DL,
9674                       getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9675     }
9676 
9677     auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9678     auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9679     auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9680     SDValue Ops[] = {
9681         Chain,
9682         VData,
9683         Rsrc,
9684         Op.getOperand(4),                      // vindex
9685         Offsets.first,                         // voffset
9686         SOffset,                               // soffset
9687         Offsets.second,                        // offset
9688         Op.getOperand(7),                      // cachepolicy, swizzled buffer
9689         DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9690     };
9691     unsigned Opc =
9692         !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
9693     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9694     MemSDNode *M = cast<MemSDNode>(Op);
9695 
9696     // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9697     EVT VDataType = VData.getValueType().getScalarType();
9698     if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9699       return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9700 
9701     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9702                                    M->getMemoryVT(), M->getMemOperand());
9703   }
9704   case Intrinsic::amdgcn_raw_buffer_load_lds:
9705   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9706   case Intrinsic::amdgcn_struct_buffer_load_lds:
9707   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9708     assert(!AMDGPU::isGFX12Plus(*Subtarget));
9709     unsigned Opc;
9710     bool HasVIndex =
9711         IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9712         IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9713     unsigned OpOffset = HasVIndex ? 1 : 0;
9714     SDValue VOffset = Op.getOperand(5 + OpOffset);
9715     bool HasVOffset = !isNullConstant(VOffset);
9716     unsigned Size = Op->getConstantOperandVal(4);
9717 
9718     switch (Size) {
9719     default:
9720       return SDValue();
9721     case 1:
9722       Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9723                                    : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9724                       : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9725                                    : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9726       break;
9727     case 2:
9728       Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9729                                    : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9730                       : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9731                                    : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9732       break;
9733     case 4:
9734       Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9735                                    : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9736                       : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9737                                    : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9738       break;
9739     }
9740 
9741     SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9742 
9743     SmallVector<SDValue, 8> Ops;
9744 
9745     if (HasVIndex && HasVOffset)
9746       Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9747                                        { Op.getOperand(5), // VIndex
9748                                          VOffset }));
9749     else if (HasVIndex)
9750       Ops.push_back(Op.getOperand(5));
9751     else if (HasVOffset)
9752       Ops.push_back(VOffset);
9753 
9754     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9755     Ops.push_back(Rsrc);
9756     Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9757     Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9758     unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9759     Ops.push_back(
9760       DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
9761     Ops.push_back(DAG.getTargetConstant(
9762         Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz
9763     Ops.push_back(M0Val.getValue(0)); // Chain
9764     Ops.push_back(M0Val.getValue(1)); // Glue
9765 
9766     auto *M = cast<MemSDNode>(Op);
9767     MachineMemOperand *LoadMMO = M->getMemOperand();
9768     // Don't set the offset value here because the pointer points to the base of
9769     // the buffer.
9770     MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9771 
9772     MachinePointerInfo StorePtrI = LoadPtrI;
9773     LoadPtrI.V = PoisonValue::get(
9774         PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9775     LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9776     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9777 
9778     auto F = LoadMMO->getFlags() &
9779              ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9780     LoadMMO =
9781         MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9782                                 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9783 
9784     MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9785         StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9786         LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9787 
9788     auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9789     DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9790 
9791     return SDValue(Load, 0);
9792   }
9793   case Intrinsic::amdgcn_global_load_lds: {
9794     unsigned Opc;
9795     unsigned Size = Op->getConstantOperandVal(4);
9796     switch (Size) {
9797     default:
9798       return SDValue();
9799     case 1:
9800       Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9801       break;
9802     case 2:
9803       Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9804       break;
9805     case 4:
9806       Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9807       break;
9808     }
9809 
9810     auto *M = cast<MemSDNode>(Op);
9811     SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9812 
9813     SmallVector<SDValue, 6> Ops;
9814 
9815     SDValue Addr = Op.getOperand(2); // Global ptr
9816     SDValue VOffset;
9817     // Try to split SAddr and VOffset. Global and LDS pointers share the same
9818     // immediate offset, so we cannot use a regular SelectGlobalSAddr().
9819     if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
9820       SDValue LHS = Addr.getOperand(0);
9821       SDValue RHS = Addr.getOperand(1);
9822 
9823       if (LHS->isDivergent())
9824         std::swap(LHS, RHS);
9825 
9826       if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
9827           RHS.getOperand(0).getValueType() == MVT::i32) {
9828         // add (i64 sgpr), (zero_extend (i32 vgpr))
9829         Addr = LHS;
9830         VOffset = RHS.getOperand(0);
9831       }
9832     }
9833 
9834     Ops.push_back(Addr);
9835     if (!Addr->isDivergent()) {
9836       Opc = AMDGPU::getGlobalSaddrOp(Opc);
9837       if (!VOffset)
9838         VOffset = SDValue(
9839             DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
9840                                DAG.getTargetConstant(0, DL, MVT::i32)), 0);
9841       Ops.push_back(VOffset);
9842     }
9843 
9844     Ops.push_back(Op.getOperand(5));  // Offset
9845     Ops.push_back(Op.getOperand(6));  // CPol
9846     Ops.push_back(M0Val.getValue(0)); // Chain
9847     Ops.push_back(M0Val.getValue(1)); // Glue
9848 
9849     MachineMemOperand *LoadMMO = M->getMemOperand();
9850     MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9851     LoadPtrI.Offset = Op->getConstantOperandVal(5);
9852     MachinePointerInfo StorePtrI = LoadPtrI;
9853     LoadPtrI.V = PoisonValue::get(
9854         PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9855     LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9856     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9857     auto F = LoadMMO->getFlags() &
9858              ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9859     LoadMMO =
9860         MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9861                                 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9862     MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9863         StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
9864         LoadMMO->getAAInfo());
9865 
9866     auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9867     DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9868 
9869     return SDValue(Load, 0);
9870   }
9871   case Intrinsic::amdgcn_end_cf:
9872     return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
9873                                       Op->getOperand(2), Chain), 0);
9874   case Intrinsic::amdgcn_s_barrier_init:
9875   case Intrinsic::amdgcn_s_barrier_join:
9876   case Intrinsic::amdgcn_s_wakeup_barrier: {
9877     SDValue Chain = Op->getOperand(0);
9878     SmallVector<SDValue, 2> Ops;
9879     SDValue BarOp = Op->getOperand(2);
9880     unsigned Opc;
9881     bool IsInlinableBarID = false;
9882     int64_t BarVal;
9883 
9884     if (isa<ConstantSDNode>(BarOp)) {
9885       BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
9886       IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
9887     }
9888 
9889     if (IsInlinableBarID) {
9890       switch (IntrinsicID) {
9891       default:
9892         return SDValue();
9893       case Intrinsic::amdgcn_s_barrier_init:
9894         Opc = AMDGPU::S_BARRIER_INIT_IMM;
9895         break;
9896       case Intrinsic::amdgcn_s_barrier_join:
9897         Opc = AMDGPU::S_BARRIER_JOIN_IMM;
9898         break;
9899       case Intrinsic::amdgcn_s_wakeup_barrier:
9900         Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
9901         break;
9902       }
9903 
9904       SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
9905       Ops.push_back(K);
9906     } else {
9907       switch (IntrinsicID) {
9908       default:
9909         return SDValue();
9910       case Intrinsic::amdgcn_s_barrier_init:
9911         Opc = AMDGPU::S_BARRIER_INIT_M0;
9912         break;
9913       case Intrinsic::amdgcn_s_barrier_join:
9914         Opc = AMDGPU::S_BARRIER_JOIN_M0;
9915         break;
9916       case Intrinsic::amdgcn_s_wakeup_barrier:
9917         Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
9918         break;
9919       }
9920     }
9921 
9922     if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
9923       SDValue M0Val;
9924       // Member count will be read from M0[16:22]
9925       M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
9926                           DAG.getShiftAmountConstant(16, MVT::i32, DL));
9927 
9928       if (!IsInlinableBarID) {
9929         // If reference to barrier id is not an inline constant then it must be
9930         // referenced with M0[4:0]. Perform an OR with the member count to
9931         // include it in M0.
9932         M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32,
9933                                            Op.getOperand(2), M0Val),
9934                         0);
9935       }
9936       Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9937     } else if (!IsInlinableBarID) {
9938       Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0));
9939     }
9940 
9941     auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9942     return SDValue(NewMI, 0);
9943   }
9944   default: {
9945     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9946             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
9947       return lowerImage(Op, ImageDimIntr, DAG, true);
9948 
9949     return Op;
9950   }
9951   }
9952 }
9953 
9954 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
9955 // offset (the offset that is included in bounds checking and swizzling, to be
9956 // split between the instruction's voffset and immoffset fields) and soffset
9957 // (the offset that is excluded from bounds checking and swizzling, to go in
9958 // the instruction's soffset field).  This function takes the first kind of
9959 // offset and figures out how to split it between voffset and immoffset.
9960 std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
9961     SDValue Offset, SelectionDAG &DAG) const {
9962   SDLoc DL(Offset);
9963   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
9964   SDValue N0 = Offset;
9965   ConstantSDNode *C1 = nullptr;
9966 
9967   if ((C1 = dyn_cast<ConstantSDNode>(N0)))
9968     N0 = SDValue();
9969   else if (DAG.isBaseWithConstantOffset(N0)) {
9970     C1 = cast<ConstantSDNode>(N0.getOperand(1));
9971     N0 = N0.getOperand(0);
9972   }
9973 
9974   if (C1) {
9975     unsigned ImmOffset = C1->getZExtValue();
9976     // If the immediate value is too big for the immoffset field, put only bits
9977     // that would normally fit in the immoffset field. The remaining value that
9978     // is copied/added for the voffset field is a large power of 2, and it
9979     // stands more chance of being CSEd with the copy/add for another similar
9980     // load/store.
9981     // However, do not do that rounding down if that is a negative
9982     // number, as it appears to be illegal to have a negative offset in the
9983     // vgpr, even if adding the immediate offset makes it positive.
9984     unsigned Overflow = ImmOffset & ~MaxImm;
9985     ImmOffset -= Overflow;
9986     if ((int32_t)Overflow < 0) {
9987       Overflow += ImmOffset;
9988       ImmOffset = 0;
9989     }
9990     C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
9991     if (Overflow) {
9992       auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
9993       if (!N0)
9994         N0 = OverflowVal;
9995       else {
9996         SDValue Ops[] = { N0, OverflowVal };
9997         N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
9998       }
9999     }
10000   }
10001   if (!N0)
10002     N0 = DAG.getConstant(0, DL, MVT::i32);
10003   if (!C1)
10004     C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10005   return {N0, SDValue(C1, 0)};
10006 }
10007 
10008 // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10009 // the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10010 // pointed to by Offsets.
10011 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10012                                         SelectionDAG &DAG, SDValue *Offsets,
10013                                         Align Alignment) const {
10014   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10015   SDLoc DL(CombinedOffset);
10016   if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10017     uint32_t Imm = C->getZExtValue();
10018     uint32_t SOffset, ImmOffset;
10019     if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10020       Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10021       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10022       Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10023       return;
10024     }
10025   }
10026   if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10027     SDValue N0 = CombinedOffset.getOperand(0);
10028     SDValue N1 = CombinedOffset.getOperand(1);
10029     uint32_t SOffset, ImmOffset;
10030     int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10031     if (Offset >= 0 &&
10032         TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10033       Offsets[0] = N0;
10034       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10035       Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10036       return;
10037     }
10038   }
10039 
10040   SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10041                             ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10042                             : DAG.getConstant(0, DL, MVT::i32);
10043 
10044   Offsets[0] = CombinedOffset;
10045   Offsets[1] = SOffsetZero;
10046   Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10047 }
10048 
10049 SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10050                                                 SelectionDAG &DAG) const {
10051   if (!MaybePointer.getValueType().isScalarInteger())
10052     return MaybePointer;
10053 
10054   SDLoc DL(MaybePointer);
10055 
10056   SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10057   return Rsrc;
10058 }
10059 
10060 // Wrap a global or flat pointer into a buffer intrinsic using the flags
10061 // specified in the intrinsic.
10062 SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10063                                                    SelectionDAG &DAG) const {
10064   SDLoc Loc(Op);
10065 
10066   SDValue Pointer = Op->getOperand(1);
10067   SDValue Stride = Op->getOperand(2);
10068   SDValue NumRecords = Op->getOperand(3);
10069   SDValue Flags = Op->getOperand(4);
10070 
10071   auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10072   SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10073   SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10074   std::optional<uint32_t> ConstStride = std::nullopt;
10075   if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10076     ConstStride = ConstNode->getZExtValue();
10077 
10078   SDValue NewHighHalf = Masked;
10079   if (!ConstStride || *ConstStride != 0) {
10080     SDValue ShiftedStride;
10081     if (ConstStride) {
10082       ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10083     } else {
10084       SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10085       ShiftedStride =
10086           DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10087                       DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10088     }
10089     NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10090   }
10091 
10092   SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10093                              NewHighHalf, NumRecords, Flags);
10094   SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10095   return RsrcPtr;
10096 }
10097 
10098 // Handle 8 bit and 16 bit buffer loads
10099 SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10100                                                      EVT LoadVT, SDLoc DL,
10101                                                      ArrayRef<SDValue> Ops,
10102                                                      MachineMemOperand *MMO,
10103                                                      bool IsTFE) const {
10104   EVT IntVT = LoadVT.changeTypeToInteger();
10105 
10106   if (IsTFE) {
10107     unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10108                        ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
10109                        : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
10110     MachineFunction &MF = DAG.getMachineFunction();
10111     MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
10112     SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
10113     SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10114     SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10115                                  DAG.getConstant(1, DL, MVT::i32));
10116     SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10117                                DAG.getConstant(0, DL, MVT::i32));
10118     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
10119     SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
10120     return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10121   }
10122 
10123   unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
10124          AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
10125 
10126   SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10127   SDValue BufferLoad =
10128       DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10129   SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10130   LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10131 
10132   return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10133 }
10134 
10135 // Handle 8 bit and 16 bit buffer stores
10136 SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10137                                                       EVT VDataType, SDLoc DL,
10138                                                       SDValue Ops[],
10139                                                       MemSDNode *M) const {
10140   if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10141     Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10142 
10143   SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10144   Ops[1] = BufferStoreExt;
10145   unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
10146                                  AMDGPUISD::BUFFER_STORE_SHORT;
10147   ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10148   return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10149                                      M->getMemOperand());
10150 }
10151 
10152 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
10153                                  ISD::LoadExtType ExtType, SDValue Op,
10154                                  const SDLoc &SL, EVT VT) {
10155   if (VT.bitsLT(Op.getValueType()))
10156     return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10157 
10158   switch (ExtType) {
10159   case ISD::SEXTLOAD:
10160     return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10161   case ISD::ZEXTLOAD:
10162     return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10163   case ISD::EXTLOAD:
10164     return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10165   case ISD::NON_EXTLOAD:
10166     return Op;
10167   }
10168 
10169   llvm_unreachable("invalid ext type");
10170 }
10171 
10172 // Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10173 // TODO: Skip this on GFX12 which does have scalar sub-dword loads.
10174 SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
10175   SelectionDAG &DAG = DCI.DAG;
10176   if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10177     return SDValue();
10178 
10179   // FIXME: Constant loads should all be marked invariant.
10180   unsigned AS = Ld->getAddressSpace();
10181   if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10182       AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10183       (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10184     return SDValue();
10185 
10186   // Don't do this early, since it may interfere with adjacent load merging for
10187   // illegal types. We can avoid losing alignment information for exotic types
10188   // pre-legalize.
10189   EVT MemVT = Ld->getMemoryVT();
10190   if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10191       MemVT.getSizeInBits() >= 32)
10192     return SDValue();
10193 
10194   SDLoc SL(Ld);
10195 
10196   assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10197          "unexpected vector extload");
10198 
10199   // TODO: Drop only high part of range.
10200   SDValue Ptr = Ld->getBasePtr();
10201   SDValue NewLoad = DAG.getLoad(
10202       ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10203       Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10204       Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10205       nullptr); // Drop ranges
10206 
10207   EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10208   if (MemVT.isFloatingPoint()) {
10209     assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10210            "unexpected fp extload");
10211     TruncVT = MemVT.changeTypeToInteger();
10212   }
10213 
10214   SDValue Cvt = NewLoad;
10215   if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10216     Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10217                       DAG.getValueType(TruncVT));
10218   } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10219              Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10220     Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10221   } else {
10222     assert(Ld->getExtensionType() == ISD::EXTLOAD);
10223   }
10224 
10225   EVT VT = Ld->getValueType(0);
10226   EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10227 
10228   DCI.AddToWorklist(Cvt.getNode());
10229 
10230   // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10231   // the appropriate extension from the 32-bit load.
10232   Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10233   DCI.AddToWorklist(Cvt.getNode());
10234 
10235   // Handle conversion back to floating point if necessary.
10236   Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10237 
10238   return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
10239 }
10240 
10241 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10242                                           const SIMachineFunctionInfo &Info) {
10243   // TODO: Should check if the address can definitely not access stack.
10244   if (Info.isEntryFunction())
10245     return Info.getUserSGPRInfo().hasFlatScratchInit();
10246   return true;
10247 }
10248 
10249 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10250   SDLoc DL(Op);
10251   LoadSDNode *Load = cast<LoadSDNode>(Op);
10252   ISD::LoadExtType ExtType = Load->getExtensionType();
10253   EVT MemVT = Load->getMemoryVT();
10254 
10255   if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10256     if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10257       return SDValue();
10258 
10259     // FIXME: Copied from PPC
10260     // First, load into 32 bits, then truncate to 1 bit.
10261 
10262     SDValue Chain = Load->getChain();
10263     SDValue BasePtr = Load->getBasePtr();
10264     MachineMemOperand *MMO = Load->getMemOperand();
10265 
10266     EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10267 
10268     SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
10269                                    BasePtr, RealMemVT, MMO);
10270 
10271     if (!MemVT.isVector()) {
10272       SDValue Ops[] = {
10273         DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10274         NewLD.getValue(1)
10275       };
10276 
10277       return DAG.getMergeValues(Ops, DL);
10278     }
10279 
10280     SmallVector<SDValue, 3> Elts;
10281     for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10282       SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10283                                 DAG.getConstant(I, DL, MVT::i32));
10284 
10285       Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10286     }
10287 
10288     SDValue Ops[] = {
10289       DAG.getBuildVector(MemVT, DL, Elts),
10290       NewLD.getValue(1)
10291     };
10292 
10293     return DAG.getMergeValues(Ops, DL);
10294   }
10295 
10296   if (!MemVT.isVector())
10297     return SDValue();
10298 
10299   assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10300          "Custom lowering for non-i32 vectors hasn't been implemented.");
10301 
10302   Align Alignment = Load->getAlign();
10303   unsigned AS = Load->getAddressSpace();
10304   if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10305       Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10306     return SplitVectorLoad(Op, DAG);
10307   }
10308 
10309   MachineFunction &MF = DAG.getMachineFunction();
10310   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10311   // If there is a possibility that flat instruction access scratch memory
10312   // then we need to use the same legalization rules we use for private.
10313   if (AS == AMDGPUAS::FLAT_ADDRESS &&
10314       !Subtarget->hasMultiDwordFlatScratchAddressing())
10315     AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ?
10316          AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10317 
10318   unsigned NumElements = MemVT.getVectorNumElements();
10319 
10320   if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10321       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
10322     if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
10323       if (MemVT.isPow2VectorType() ||
10324           (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10325         return SDValue();
10326       return WidenOrSplitVectorLoad(Op, DAG);
10327     }
10328     // Non-uniform loads will be selected to MUBUF instructions, so they
10329     // have the same legalization requirements as global and private
10330     // loads.
10331     //
10332   }
10333 
10334   if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10335       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10336       AS == AMDGPUAS::GLOBAL_ADDRESS) {
10337     if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
10338         Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
10339         Alignment >= Align(4) && NumElements < 32) {
10340       if (MemVT.isPow2VectorType() ||
10341           (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10342         return SDValue();
10343       return WidenOrSplitVectorLoad(Op, DAG);
10344     }
10345     // Non-uniform loads will be selected to MUBUF instructions, so they
10346     // have the same legalization requirements as global and private
10347     // loads.
10348     //
10349   }
10350   if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10351       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10352       AS == AMDGPUAS::GLOBAL_ADDRESS ||
10353       AS == AMDGPUAS::FLAT_ADDRESS) {
10354     if (NumElements > 4)
10355       return SplitVectorLoad(Op, DAG);
10356     // v3 loads not supported on SI.
10357     if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10358       return WidenOrSplitVectorLoad(Op, DAG);
10359 
10360     // v3 and v4 loads are supported for private and global memory.
10361     return SDValue();
10362   }
10363   if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10364     // Depending on the setting of the private_element_size field in the
10365     // resource descriptor, we can only make private accesses up to a certain
10366     // size.
10367     switch (Subtarget->getMaxPrivateElementSize()) {
10368     case 4: {
10369       SDValue Ops[2];
10370       std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
10371       return DAG.getMergeValues(Ops, DL);
10372     }
10373     case 8:
10374       if (NumElements > 2)
10375         return SplitVectorLoad(Op, DAG);
10376       return SDValue();
10377     case 16:
10378       // Same as global/flat
10379       if (NumElements > 4)
10380         return SplitVectorLoad(Op, DAG);
10381       // v3 loads not supported on SI.
10382       if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10383         return WidenOrSplitVectorLoad(Op, DAG);
10384 
10385       return SDValue();
10386     default:
10387       llvm_unreachable("unsupported private_element_size");
10388     }
10389   } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10390     unsigned Fast = 0;
10391     auto Flags = Load->getMemOperand()->getFlags();
10392     if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10393                                            Load->getAlign(), Flags, &Fast) &&
10394         Fast > 1)
10395       return SDValue();
10396 
10397     if (MemVT.isVector())
10398       return SplitVectorLoad(Op, DAG);
10399   }
10400 
10401   if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10402                                       MemVT, *Load->getMemOperand())) {
10403     SDValue Ops[2];
10404     std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
10405     return DAG.getMergeValues(Ops, DL);
10406   }
10407 
10408   return SDValue();
10409 }
10410 
10411 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10412   EVT VT = Op.getValueType();
10413   if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10414       VT.getSizeInBits() == 512)
10415     return splitTernaryVectorOp(Op, DAG);
10416 
10417   assert(VT.getSizeInBits() == 64);
10418 
10419   SDLoc DL(Op);
10420   SDValue Cond = Op.getOperand(0);
10421 
10422   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10423   SDValue One = DAG.getConstant(1, DL, MVT::i32);
10424 
10425   SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10426   SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10427 
10428   SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10429   SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10430 
10431   SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10432 
10433   SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10434   SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10435 
10436   SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10437 
10438   SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10439   return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10440 }
10441 
10442 // Catch division cases where we can use shortcuts with rcp and rsq
10443 // instructions.
10444 SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10445                                               SelectionDAG &DAG) const {
10446   SDLoc SL(Op);
10447   SDValue LHS = Op.getOperand(0);
10448   SDValue RHS = Op.getOperand(1);
10449   EVT VT = Op.getValueType();
10450   const SDNodeFlags Flags = Op->getFlags();
10451 
10452   bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
10453                             DAG.getTarget().Options.UnsafeFPMath;
10454 
10455   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10456     // Without !fpmath accuracy information, we can't do more because we don't
10457     // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10458     // f16 is always accurate enough
10459     if (!AllowInaccurateRcp && VT != MVT::f16)
10460       return SDValue();
10461 
10462     if (CLHS->isExactlyValue(1.0)) {
10463       // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
10464       // the CI documentation has a worst case error of 1 ulp.
10465       // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10466       // use it as long as we aren't trying to use denormals.
10467       //
10468       // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
10469 
10470       // 1.0 / sqrt(x) -> rsq(x)
10471 
10472       // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10473       // error seems really high at 2^29 ULP.
10474       // 1.0 / x -> rcp(x)
10475       return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10476     }
10477 
10478     // Same as for 1.0, but expand the sign out of the constant.
10479     if (CLHS->isExactlyValue(-1.0)) {
10480       // -1.0 / x -> rcp (fneg x)
10481       SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10482       return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10483     }
10484   }
10485 
10486   // For f16 require afn or arcp.
10487   // For f32 require afn.
10488   if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10489     return SDValue();
10490 
10491   // Turn into multiply by the reciprocal.
10492   // x / y -> x * (1.0 / y)
10493   SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10494   return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10495 }
10496 
10497 SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10498                                                 SelectionDAG &DAG) const {
10499   SDLoc SL(Op);
10500   SDValue X = Op.getOperand(0);
10501   SDValue Y = Op.getOperand(1);
10502   EVT VT = Op.getValueType();
10503   const SDNodeFlags Flags = Op->getFlags();
10504 
10505   bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
10506                             DAG.getTarget().Options.UnsafeFPMath;
10507   if (!AllowInaccurateDiv)
10508     return SDValue();
10509 
10510   SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10511   SDValue One = DAG.getConstantFP(1.0, SL, VT);
10512 
10513   SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10514   SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10515 
10516   R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10517   SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10518   R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10519   SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10520   SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10521   return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10522 }
10523 
10524 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10525                           EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10526                           SDNodeFlags Flags) {
10527   if (GlueChain->getNumValues() <= 1) {
10528     return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10529   }
10530 
10531   assert(GlueChain->getNumValues() == 3);
10532 
10533   SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10534   switch (Opcode) {
10535   default: llvm_unreachable("no chain equivalent for opcode");
10536   case ISD::FMUL:
10537     Opcode = AMDGPUISD::FMUL_W_CHAIN;
10538     break;
10539   }
10540 
10541   return DAG.getNode(Opcode, SL, VTList,
10542                      {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10543                      Flags);
10544 }
10545 
10546 static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10547                            EVT VT, SDValue A, SDValue B, SDValue C,
10548                            SDValue GlueChain, SDNodeFlags Flags) {
10549   if (GlueChain->getNumValues() <= 1) {
10550     return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10551   }
10552 
10553   assert(GlueChain->getNumValues() == 3);
10554 
10555   SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10556   switch (Opcode) {
10557   default: llvm_unreachable("no chain equivalent for opcode");
10558   case ISD::FMA:
10559     Opcode = AMDGPUISD::FMA_W_CHAIN;
10560     break;
10561   }
10562 
10563   return DAG.getNode(Opcode, SL, VTList,
10564                      {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10565                      Flags);
10566 }
10567 
10568 SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10569   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10570     return FastLowered;
10571 
10572   SDLoc SL(Op);
10573   SDValue Src0 = Op.getOperand(0);
10574   SDValue Src1 = Op.getOperand(1);
10575 
10576   SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
10577   SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
10578 
10579   SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
10580   SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
10581 
10582   SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
10583   SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10584 
10585   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
10586 }
10587 
10588 // Faster 2.5 ULP division that does not support denormals.
10589 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10590   SDNodeFlags Flags = Op->getFlags();
10591   SDLoc SL(Op);
10592   SDValue LHS = Op.getOperand(1);
10593   SDValue RHS = Op.getOperand(2);
10594 
10595   SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10596 
10597   const APFloat K0Val(0x1p+96f);
10598   const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10599 
10600   const APFloat K1Val(0x1p-32f);
10601   const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10602 
10603   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10604 
10605   EVT SetCCVT =
10606     getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10607 
10608   SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10609 
10610   SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10611 
10612   r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10613 
10614   // rcp does not support denormals.
10615   SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10616 
10617   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10618 
10619   return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10620 }
10621 
10622 // Returns immediate value for setting the F32 denorm mode when using the
10623 // S_DENORM_MODE instruction.
10624 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10625                                     const SIMachineFunctionInfo *Info,
10626                                     const GCNSubtarget *ST) {
10627   assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10628   uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10629   uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10630   return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10631 }
10632 
10633 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10634   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10635     return FastLowered;
10636 
10637   // The selection matcher assumes anything with a chain selecting to a
10638   // mayRaiseFPException machine instruction. Since we're introducing a chain
10639   // here, we need to explicitly report nofpexcept for the regular fdiv
10640   // lowering.
10641   SDNodeFlags Flags = Op->getFlags();
10642   Flags.setNoFPExcept(true);
10643 
10644   SDLoc SL(Op);
10645   SDValue LHS = Op.getOperand(0);
10646   SDValue RHS = Op.getOperand(1);
10647 
10648   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10649 
10650   SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10651 
10652   SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10653                                           {RHS, RHS, LHS}, Flags);
10654   SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10655                                         {LHS, RHS, LHS}, Flags);
10656 
10657   // Denominator is scaled to not be denormal, so using rcp is ok.
10658   SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
10659                                   DenominatorScaled, Flags);
10660   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
10661                                      DenominatorScaled, Flags);
10662 
10663   using namespace AMDGPU::Hwreg;
10664   const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10665   const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10666 
10667   const MachineFunction &MF = DAG.getMachineFunction();
10668   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10669   const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10670 
10671   const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10672   const bool HasDynamicDenormals =
10673       (DenormMode.Input == DenormalMode::Dynamic) ||
10674       (DenormMode.Output == DenormalMode::Dynamic);
10675 
10676   SDValue SavedDenormMode;
10677 
10678   if (!PreservesDenormals) {
10679     // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10680     // lowering. The chain dependence is insufficient, and we need glue. We do
10681     // not need the glue variants in a strictfp function.
10682 
10683     SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10684 
10685     SDValue Glue = DAG.getEntryNode();
10686     if (HasDynamicDenormals) {
10687       SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10688                                           DAG.getVTList(MVT::i32, MVT::Glue),
10689                                           {BitField, Glue});
10690       SavedDenormMode = SDValue(GetReg, 0);
10691 
10692       Glue = DAG.getMergeValues(
10693           {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10694     }
10695 
10696     SDNode *EnableDenorm;
10697     if (Subtarget->hasDenormModeInst()) {
10698       const SDValue EnableDenormValue =
10699           getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10700 
10701       EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10702                                  EnableDenormValue)
10703                          .getNode();
10704     } else {
10705       const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
10706                                                         SL, MVT::i32);
10707       EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10708                                         {EnableDenormValue, BitField, Glue});
10709     }
10710 
10711     SDValue Ops[3] = {
10712       NegDivScale0,
10713       SDValue(EnableDenorm, 0),
10714       SDValue(EnableDenorm, 1)
10715     };
10716 
10717     NegDivScale0 = DAG.getMergeValues(Ops, SL);
10718   }
10719 
10720   SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10721                              ApproxRcp, One, NegDivScale0, Flags);
10722 
10723   SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10724                              ApproxRcp, Fma0, Flags);
10725 
10726   SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
10727                            Fma1, Fma1, Flags);
10728 
10729   SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10730                              NumeratorScaled, Mul, Flags);
10731 
10732   SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
10733                              Fma2, Fma1, Mul, Fma2, Flags);
10734 
10735   SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10736                              NumeratorScaled, Fma3, Flags);
10737 
10738   if (!PreservesDenormals) {
10739     SDNode *DisableDenorm;
10740     if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10741       const SDValue DisableDenormValue = getSPDenormModeValue(
10742           FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10743 
10744       DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
10745                                   Fma4.getValue(1), DisableDenormValue,
10746                                   Fma4.getValue(2)).getNode();
10747     } else {
10748       assert(HasDynamicDenormals == (bool)SavedDenormMode);
10749       const SDValue DisableDenormValue =
10750           HasDynamicDenormals
10751               ? SavedDenormMode
10752               : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10753 
10754       DisableDenorm = DAG.getMachineNode(
10755           AMDGPU::S_SETREG_B32, SL, MVT::Other,
10756           {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10757     }
10758 
10759     SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10760                                       SDValue(DisableDenorm, 0), DAG.getRoot());
10761     DAG.setRoot(OutputChain);
10762   }
10763 
10764   SDValue Scale = NumeratorScaled.getValue(1);
10765   SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
10766                              {Fma4, Fma1, Fma3, Scale}, Flags);
10767 
10768   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
10769 }
10770 
10771 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
10772   if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
10773     return FastLowered;
10774 
10775   SDLoc SL(Op);
10776   SDValue X = Op.getOperand(0);
10777   SDValue Y = Op.getOperand(1);
10778 
10779   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
10780 
10781   SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
10782 
10783   SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
10784 
10785   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
10786 
10787   SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
10788 
10789   SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
10790 
10791   SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
10792 
10793   SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
10794 
10795   SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
10796 
10797   SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
10798   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
10799 
10800   SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
10801                              NegDivScale0, Mul, DivScale1);
10802 
10803   SDValue Scale;
10804 
10805   if (!Subtarget->hasUsableDivScaleConditionOutput()) {
10806     // Workaround a hardware bug on SI where the condition output from div_scale
10807     // is not usable.
10808 
10809     const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
10810 
10811     // Figure out if the scale to use for div_fmas.
10812     SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
10813     SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
10814     SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
10815     SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
10816 
10817     SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
10818     SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
10819 
10820     SDValue Scale0Hi
10821       = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
10822     SDValue Scale1Hi
10823       = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
10824 
10825     SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
10826     SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
10827     Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
10828   } else {
10829     Scale = DivScale1.getValue(1);
10830   }
10831 
10832   SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
10833                              Fma4, Fma3, Mul, Scale);
10834 
10835   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
10836 }
10837 
10838 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
10839   EVT VT = Op.getValueType();
10840 
10841   if (VT == MVT::f32)
10842     return LowerFDIV32(Op, DAG);
10843 
10844   if (VT == MVT::f64)
10845     return LowerFDIV64(Op, DAG);
10846 
10847   if (VT == MVT::f16)
10848     return LowerFDIV16(Op, DAG);
10849 
10850   llvm_unreachable("Unexpected type for fdiv");
10851 }
10852 
10853 SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
10854   SDLoc dl(Op);
10855   SDValue Val = Op.getOperand(0);
10856   EVT VT = Val.getValueType();
10857   EVT ResultExpVT = Op->getValueType(1);
10858   EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
10859 
10860   SDValue Mant = DAG.getNode(
10861       ISD::INTRINSIC_WO_CHAIN, dl, VT,
10862       DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
10863 
10864   SDValue Exp = DAG.getNode(
10865       ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
10866       DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
10867 
10868   if (Subtarget->hasFractBug()) {
10869     SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
10870     SDValue Inf = DAG.getConstantFP(
10871         APFloat::getInf(SelectionDAG::EVTToAPFloatSemantics(VT)), dl, VT);
10872 
10873     SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
10874     SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
10875     Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
10876     Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
10877   }
10878 
10879   SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
10880   return DAG.getMergeValues({Mant, CastExp}, dl);
10881 }
10882 
10883 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
10884   SDLoc DL(Op);
10885   StoreSDNode *Store = cast<StoreSDNode>(Op);
10886   EVT VT = Store->getMemoryVT();
10887 
10888   if (VT == MVT::i1) {
10889     return DAG.getTruncStore(Store->getChain(), DL,
10890        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
10891        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
10892   }
10893 
10894   assert(VT.isVector() &&
10895          Store->getValue().getValueType().getScalarType() == MVT::i32);
10896 
10897   unsigned AS = Store->getAddressSpace();
10898   if (Subtarget->hasLDSMisalignedBug() &&
10899       AS == AMDGPUAS::FLAT_ADDRESS &&
10900       Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
10901     return SplitVectorStore(Op, DAG);
10902   }
10903 
10904   MachineFunction &MF = DAG.getMachineFunction();
10905   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10906   // If there is a possibility that flat instruction access scratch memory
10907   // then we need to use the same legalization rules we use for private.
10908   if (AS == AMDGPUAS::FLAT_ADDRESS &&
10909       !Subtarget->hasMultiDwordFlatScratchAddressing())
10910     AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ?
10911          AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10912 
10913   unsigned NumElements = VT.getVectorNumElements();
10914   if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
10915       AS == AMDGPUAS::FLAT_ADDRESS) {
10916     if (NumElements > 4)
10917       return SplitVectorStore(Op, DAG);
10918     // v3 stores not supported on SI.
10919     if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10920       return SplitVectorStore(Op, DAG);
10921 
10922     if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10923                                         VT, *Store->getMemOperand()))
10924       return expandUnalignedStore(Store, DAG);
10925 
10926     return SDValue();
10927   }
10928   if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10929     switch (Subtarget->getMaxPrivateElementSize()) {
10930     case 4:
10931       return scalarizeVectorStore(Store, DAG);
10932     case 8:
10933       if (NumElements > 2)
10934         return SplitVectorStore(Op, DAG);
10935       return SDValue();
10936     case 16:
10937       if (NumElements > 4 ||
10938           (NumElements == 3 && !Subtarget->enableFlatScratch()))
10939         return SplitVectorStore(Op, DAG);
10940       return SDValue();
10941     default:
10942       llvm_unreachable("unsupported private_element_size");
10943     }
10944   } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10945     unsigned Fast = 0;
10946     auto Flags = Store->getMemOperand()->getFlags();
10947     if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
10948                                            Store->getAlign(), Flags, &Fast) &&
10949         Fast > 1)
10950       return SDValue();
10951 
10952     if (VT.isVector())
10953       return SplitVectorStore(Op, DAG);
10954 
10955     return expandUnalignedStore(Store, DAG);
10956   }
10957 
10958   // Probably an invalid store. If so we'll end up emitting a selection error.
10959   return SDValue();
10960 }
10961 
10962 // Avoid the full correct expansion for f32 sqrt when promoting from f16.
10963 SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
10964   SDLoc SL(Op);
10965   assert(!Subtarget->has16BitInsts());
10966   SDNodeFlags Flags = Op->getFlags();
10967   SDValue Ext =
10968       DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
10969 
10970   SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
10971   SDValue Sqrt =
10972       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
10973 
10974   return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
10975                      DAG.getTargetConstant(0, SL, MVT::i32), Flags);
10976 }
10977 
10978 SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
10979   SDLoc DL(Op);
10980   SDNodeFlags Flags = Op->getFlags();
10981   MVT VT = Op.getValueType().getSimpleVT();
10982   const SDValue X = Op.getOperand(0);
10983 
10984   if (allowApproxFunc(DAG, Flags)) {
10985     // Instruction is 1ulp but ignores denormals.
10986     return DAG.getNode(
10987         ISD::INTRINSIC_WO_CHAIN, DL, VT,
10988         DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
10989   }
10990 
10991   SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
10992   SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
10993 
10994   SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
10995 
10996   SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
10997 
10998   SDValue SqrtX =
10999       DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11000 
11001   SDValue SqrtS;
11002   if (needsDenormHandlingF32(DAG, X, Flags)) {
11003     SDValue SqrtID =
11004         DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11005     SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11006 
11007     SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11008     SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11009                                            DAG.getConstant(-1, DL, MVT::i32));
11010     SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11011 
11012     SDValue NegSqrtSNextDown =
11013         DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11014 
11015     SDValue SqrtVP =
11016         DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11017 
11018     SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11019                                          DAG.getConstant(1, DL, MVT::i32));
11020     SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11021 
11022     SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
11023     SDValue SqrtVS =
11024         DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11025 
11026     SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11027     SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11028 
11029     SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11030                         Flags);
11031 
11032     SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11033     SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11034                         Flags);
11035   } else {
11036     SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11037 
11038     SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11039 
11040     SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11041     SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11042     SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11043 
11044     SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11045     SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11046     SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11047 
11048     SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11049     SDValue SqrtD =
11050         DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11051     SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11052   }
11053 
11054   SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11055 
11056   SDValue ScaledDown =
11057       DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11058 
11059   SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11060   SDValue IsZeroOrInf =
11061       DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11062                   DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11063 
11064   return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11065 }
11066 
11067 SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11068   // For double type, the SQRT and RSQ instructions don't have required
11069   // precision, we apply Goldschmidt's algorithm to improve the result:
11070   //
11071   //   y0 = rsq(x)
11072   //   g0 = x * y0
11073   //   h0 = 0.5 * y0
11074   //
11075   //   r0 = 0.5 - h0 * g0
11076   //   g1 = g0 * r0 + g0
11077   //   h1 = h0 * r0 + h0
11078   //
11079   //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11080   //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
11081   //   h2 = h1 * r1 + h1
11082   //
11083   //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11084   //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
11085   //
11086   //   sqrt(x) = g3
11087 
11088   SDNodeFlags Flags = Op->getFlags();
11089 
11090   SDLoc DL(Op);
11091 
11092   SDValue X = Op.getOperand(0);
11093   SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11094 
11095   SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11096 
11097   SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11098 
11099   // Scale up input if it is too small.
11100   SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11101   SDValue ScaleUp =
11102       DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11103   SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11104 
11105   SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11106 
11107   SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11108 
11109   SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11110   SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11111 
11112   SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11113   SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11114 
11115   SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11116 
11117   SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11118 
11119   SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11120   SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11121 
11122   SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11123 
11124   SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11125   SDValue SqrtD1 =
11126       DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11127 
11128   SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11129 
11130   SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
11131   SDValue ScaleDown =
11132       DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11133   SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11134 
11135   // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11136   // with finite only or nsz because rsq(+/-0) = +/-inf
11137 
11138   // TODO: Check for DAZ and expand to subnormals
11139   SDValue IsZeroOrInf =
11140       DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11141                   DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11142 
11143   // If x is +INF, +0, or -0, use its original value
11144   return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11145                      Flags);
11146 }
11147 
11148 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11149   SDLoc DL(Op);
11150   EVT VT = Op.getValueType();
11151   SDValue Arg = Op.getOperand(0);
11152   SDValue TrigVal;
11153 
11154   // Propagate fast-math flags so that the multiply we introduce can be folded
11155   // if Arg is already the result of a multiply by constant.
11156   auto Flags = Op->getFlags();
11157 
11158   SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11159 
11160   if (Subtarget->hasTrigReducedRange()) {
11161     SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11162     TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11163   } else {
11164     TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11165   }
11166 
11167   switch (Op.getOpcode()) {
11168   case ISD::FCOS:
11169     return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11170   case ISD::FSIN:
11171     return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11172   default:
11173     llvm_unreachable("Wrong trig opcode");
11174   }
11175 }
11176 
11177 SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
11178   AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11179   assert(AtomicNode->isCompareAndSwap());
11180   unsigned AS = AtomicNode->getAddressSpace();
11181 
11182   // No custom lowering required for local address space
11183   if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11184     return Op;
11185 
11186   // Non-local address space requires custom lowering for atomic compare
11187   // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11188   SDLoc DL(Op);
11189   SDValue ChainIn = Op.getOperand(0);
11190   SDValue Addr = Op.getOperand(1);
11191   SDValue Old = Op.getOperand(2);
11192   SDValue New = Op.getOperand(3);
11193   EVT VT = Op.getValueType();
11194   MVT SimpleVT = VT.getSimpleVT();
11195   MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11196 
11197   SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11198   SDValue Ops[] = { ChainIn, Addr, NewOld };
11199 
11200   return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
11201                                  Ops, VT, AtomicNode->getMemOperand());
11202 }
11203 
11204 //===----------------------------------------------------------------------===//
11205 // Custom DAG optimizations
11206 //===----------------------------------------------------------------------===//
11207 
11208 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
11209                                                      DAGCombinerInfo &DCI) const {
11210   EVT VT = N->getValueType(0);
11211   EVT ScalarVT = VT.getScalarType();
11212   if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11213     return SDValue();
11214 
11215   SelectionDAG &DAG = DCI.DAG;
11216   SDLoc DL(N);
11217 
11218   SDValue Src = N->getOperand(0);
11219   EVT SrcVT = Src.getValueType();
11220 
11221   // TODO: We could try to match extracting the higher bytes, which would be
11222   // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11223   // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11224   // about in practice.
11225   if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11226     if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11227       SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11228       DCI.AddToWorklist(Cvt.getNode());
11229 
11230       // For the f16 case, fold to a cast to f32 and then cast back to f16.
11231       if (ScalarVT != MVT::f32) {
11232         Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11233                           DAG.getTargetConstant(0, DL, MVT::i32));
11234       }
11235       return Cvt;
11236     }
11237   }
11238 
11239   return SDValue();
11240 }
11241 
11242 SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11243                                                   DAGCombinerInfo &DCI) const {
11244   SDValue MagnitudeOp = N->getOperand(0);
11245   SDValue SignOp = N->getOperand(1);
11246   SelectionDAG &DAG = DCI.DAG;
11247   SDLoc DL(N);
11248 
11249   // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11250   // lower half with a copy.
11251   // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11252   if (MagnitudeOp.getValueType() == MVT::f64) {
11253     SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11254     SDValue MagLo =
11255       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11256                   DAG.getConstant(0, DL, MVT::i32));
11257     SDValue MagHi =
11258       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11259                   DAG.getConstant(1, DL, MVT::i32));
11260 
11261     SDValue HiOp =
11262       DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11263 
11264     SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11265 
11266     return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11267   }
11268 
11269   if (SignOp.getValueType() != MVT::f64)
11270     return SDValue();
11271 
11272   // Reduce width of sign operand, we only need the highest bit.
11273   //
11274   // fcopysign f64:x, f64:y ->
11275   //   fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11276   // TODO: In some cases it might make sense to go all the way to f16.
11277   SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11278   SDValue SignAsF32 =
11279       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11280                   DAG.getConstant(1, DL, MVT::i32));
11281 
11282   return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11283                      SignAsF32);
11284 }
11285 
11286 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11287 // (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11288 // bits
11289 
11290 // This is a variant of
11291 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11292 //
11293 // The normal DAG combiner will do this, but only if the add has one use since
11294 // that would increase the number of instructions.
11295 //
11296 // This prevents us from seeing a constant offset that can be folded into a
11297 // memory instruction's addressing mode. If we know the resulting add offset of
11298 // a pointer can be folded into an addressing offset, we can replace the pointer
11299 // operand with the add of new constant offset. This eliminates one of the uses,
11300 // and may allow the remaining use to also be simplified.
11301 //
11302 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
11303                                                unsigned AddrSpace,
11304                                                EVT MemVT,
11305                                                DAGCombinerInfo &DCI) const {
11306   SDValue N0 = N->getOperand(0);
11307   SDValue N1 = N->getOperand(1);
11308 
11309   // We only do this to handle cases where it's profitable when there are
11310   // multiple uses of the add, so defer to the standard combine.
11311   if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11312       N0->hasOneUse())
11313     return SDValue();
11314 
11315   const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11316   if (!CN1)
11317     return SDValue();
11318 
11319   const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11320   if (!CAdd)
11321     return SDValue();
11322 
11323   SelectionDAG &DAG = DCI.DAG;
11324 
11325   if (N0->getOpcode() == ISD::OR &&
11326       !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11327     return SDValue();
11328 
11329   // If the resulting offset is too large, we can't fold it into the
11330   // addressing mode offset.
11331   APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11332   Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11333 
11334   AddrMode AM;
11335   AM.HasBaseReg = true;
11336   AM.BaseOffs = Offset.getSExtValue();
11337   if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11338     return SDValue();
11339 
11340   SDLoc SL(N);
11341   EVT VT = N->getValueType(0);
11342 
11343   SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11344   SDValue COffset = DAG.getConstant(Offset, SL, VT);
11345 
11346   SDNodeFlags Flags;
11347   Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
11348                           (N0.getOpcode() == ISD::OR ||
11349                            N0->getFlags().hasNoUnsignedWrap()));
11350 
11351   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11352 }
11353 
11354 /// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
11355 /// by the chain and intrinsic ID. Theoretically we would also need to check the
11356 /// specific intrinsic, but they all place the pointer operand first.
11357 static unsigned getBasePtrIndex(const MemSDNode *N) {
11358   switch (N->getOpcode()) {
11359   case ISD::STORE:
11360   case ISD::INTRINSIC_W_CHAIN:
11361   case ISD::INTRINSIC_VOID:
11362     return 2;
11363   default:
11364     return 1;
11365   }
11366 }
11367 
11368 SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11369                                                   DAGCombinerInfo &DCI) const {
11370   SelectionDAG &DAG = DCI.DAG;
11371   SDLoc SL(N);
11372 
11373   unsigned PtrIdx = getBasePtrIndex(N);
11374   SDValue Ptr = N->getOperand(PtrIdx);
11375 
11376   // TODO: We could also do this for multiplies.
11377   if (Ptr.getOpcode() == ISD::SHL) {
11378     SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(),  N->getAddressSpace(),
11379                                           N->getMemoryVT(), DCI);
11380     if (NewPtr) {
11381       SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
11382 
11383       NewOps[PtrIdx] = NewPtr;
11384       return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11385     }
11386   }
11387 
11388   return SDValue();
11389 }
11390 
11391 static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11392   return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11393          (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11394          (Opc == ISD::XOR && Val == 0);
11395 }
11396 
11397 // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11398 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11399 // integer combine opportunities since most 64-bit operations are decomposed
11400 // this way.  TODO: We won't want this for SALU especially if it is an inline
11401 // immediate.
11402 SDValue SITargetLowering::splitBinaryBitConstantOp(
11403   DAGCombinerInfo &DCI,
11404   const SDLoc &SL,
11405   unsigned Opc, SDValue LHS,
11406   const ConstantSDNode *CRHS) const {
11407   uint64_t Val = CRHS->getZExtValue();
11408   uint32_t ValLo = Lo_32(Val);
11409   uint32_t ValHi = Hi_32(Val);
11410   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11411 
11412     if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11413          bitOpWithConstantIsReducible(Opc, ValHi)) ||
11414         (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11415     // If we need to materialize a 64-bit immediate, it will be split up later
11416     // anyway. Avoid creating the harder to understand 64-bit immediate
11417     // materialization.
11418     return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11419   }
11420 
11421   return SDValue();
11422 }
11423 
11424 bool llvm::isBoolSGPR(SDValue V) {
11425   if (V.getValueType() != MVT::i1)
11426     return false;
11427   switch (V.getOpcode()) {
11428   default:
11429     break;
11430   case ISD::SETCC:
11431   case AMDGPUISD::FP_CLASS:
11432     return true;
11433   case ISD::AND:
11434   case ISD::OR:
11435   case ISD::XOR:
11436     return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11437   }
11438   return false;
11439 }
11440 
11441 // If a constant has all zeroes or all ones within each byte return it.
11442 // Otherwise return 0.
11443 static uint32_t getConstantPermuteMask(uint32_t C) {
11444   // 0xff for any zero byte in the mask
11445   uint32_t ZeroByteMask = 0;
11446   if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
11447   if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
11448   if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
11449   if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
11450   uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11451   if ((NonZeroByteMask & C) != NonZeroByteMask)
11452     return 0; // Partial bytes selected.
11453   return C;
11454 }
11455 
11456 // Check if a node selects whole bytes from its operand 0 starting at a byte
11457 // boundary while masking the rest. Returns select mask as in the v_perm_b32
11458 // or -1 if not succeeded.
11459 // Note byte select encoding:
11460 // value 0-3 selects corresponding source byte;
11461 // value 0xc selects zero;
11462 // value 0xff selects 0xff.
11463 static uint32_t getPermuteMask(SDValue V) {
11464   assert(V.getValueSizeInBits() == 32);
11465 
11466   if (V.getNumOperands() != 2)
11467     return ~0;
11468 
11469   ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11470   if (!N1)
11471     return ~0;
11472 
11473   uint32_t C = N1->getZExtValue();
11474 
11475   switch (V.getOpcode()) {
11476   default:
11477     break;
11478   case ISD::AND:
11479     if (uint32_t ConstMask = getConstantPermuteMask(C))
11480       return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11481     break;
11482 
11483   case ISD::OR:
11484     if (uint32_t ConstMask = getConstantPermuteMask(C))
11485       return (0x03020100 & ~ConstMask) | ConstMask;
11486     break;
11487 
11488   case ISD::SHL:
11489     if (C % 8)
11490       return ~0;
11491 
11492     return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11493 
11494   case ISD::SRL:
11495     if (C % 8)
11496       return ~0;
11497 
11498     return uint32_t(0x0c0c0c0c03020100ull >> C);
11499   }
11500 
11501   return ~0;
11502 }
11503 
11504 SDValue SITargetLowering::performAndCombine(SDNode *N,
11505                                             DAGCombinerInfo &DCI) const {
11506   if (DCI.isBeforeLegalize())
11507     return SDValue();
11508 
11509   SelectionDAG &DAG = DCI.DAG;
11510   EVT VT = N->getValueType(0);
11511   SDValue LHS = N->getOperand(0);
11512   SDValue RHS = N->getOperand(1);
11513 
11514 
11515   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11516   if (VT == MVT::i64 && CRHS) {
11517     if (SDValue Split
11518         = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11519       return Split;
11520   }
11521 
11522   if (CRHS && VT == MVT::i32) {
11523     // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11524     // nb = number of trailing zeroes in mask
11525     // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11526     // given that we are selecting 8 or 16 bit fields starting at byte boundary.
11527     uint64_t Mask = CRHS->getZExtValue();
11528     unsigned Bits = llvm::popcount(Mask);
11529     if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11530         (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11531       if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11532         unsigned Shift = CShift->getZExtValue();
11533         unsigned NB = CRHS->getAPIntValue().countr_zero();
11534         unsigned Offset = NB + Shift;
11535         if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11536           SDLoc SL(N);
11537           SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
11538                                     LHS->getOperand(0),
11539                                     DAG.getConstant(Offset, SL, MVT::i32),
11540                                     DAG.getConstant(Bits, SL, MVT::i32));
11541           EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11542           SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11543                                     DAG.getValueType(NarrowVT));
11544           SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11545                                     DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11546           return Shl;
11547         }
11548       }
11549     }
11550 
11551     // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11552     if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11553         isa<ConstantSDNode>(LHS.getOperand(2))) {
11554       uint32_t Sel = getConstantPermuteMask(Mask);
11555       if (!Sel)
11556         return SDValue();
11557 
11558       // Select 0xc for all zero bytes
11559       Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11560       SDLoc DL(N);
11561       return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11562                          LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11563     }
11564   }
11565 
11566   // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11567   // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11568   if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11569     ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11570     ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11571 
11572     SDValue X = LHS.getOperand(0);
11573     SDValue Y = RHS.getOperand(0);
11574     if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11575         !isTypeLegal(X.getValueType()))
11576       return SDValue();
11577 
11578     if (LCC == ISD::SETO) {
11579       if (X != LHS.getOperand(1))
11580         return SDValue();
11581 
11582       if (RCC == ISD::SETUNE) {
11583         const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11584         if (!C1 || !C1->isInfinity() || C1->isNegative())
11585           return SDValue();
11586 
11587         const uint32_t Mask = SIInstrFlags::N_NORMAL |
11588                               SIInstrFlags::N_SUBNORMAL |
11589                               SIInstrFlags::N_ZERO |
11590                               SIInstrFlags::P_ZERO |
11591                               SIInstrFlags::P_SUBNORMAL |
11592                               SIInstrFlags::P_NORMAL;
11593 
11594         static_assert(((~(SIInstrFlags::S_NAN |
11595                           SIInstrFlags::Q_NAN |
11596                           SIInstrFlags::N_INFINITY |
11597                           SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
11598                       "mask not equal");
11599 
11600         SDLoc DL(N);
11601         return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
11602                            X, DAG.getConstant(Mask, DL, MVT::i32));
11603       }
11604     }
11605   }
11606 
11607   if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11608     std::swap(LHS, RHS);
11609 
11610   if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11611       RHS.hasOneUse()) {
11612     ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11613     // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
11614     // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
11615     const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11616     if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11617         (RHS.getOperand(0) == LHS.getOperand(0) &&
11618          LHS.getOperand(0) == LHS.getOperand(1))) {
11619       const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11620       unsigned NewMask = LCC == ISD::SETO ?
11621         Mask->getZExtValue() & ~OrdMask :
11622         Mask->getZExtValue() & OrdMask;
11623 
11624       SDLoc DL(N);
11625       return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11626                          DAG.getConstant(NewMask, DL, MVT::i32));
11627     }
11628   }
11629 
11630   if (VT == MVT::i32 &&
11631       (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11632     // and x, (sext cc from i1) => select cc, x, 0
11633     if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11634       std::swap(LHS, RHS);
11635     if (isBoolSGPR(RHS.getOperand(0)))
11636       return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
11637                            LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
11638   }
11639 
11640   // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11641   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11642   if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11643       N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11644     uint32_t LHSMask = getPermuteMask(LHS);
11645     uint32_t RHSMask = getPermuteMask(RHS);
11646     if (LHSMask != ~0u && RHSMask != ~0u) {
11647       // Canonicalize the expression in an attempt to have fewer unique masks
11648       // and therefore fewer registers used to hold the masks.
11649       if (LHSMask > RHSMask) {
11650         std::swap(LHSMask, RHSMask);
11651         std::swap(LHS, RHS);
11652       }
11653 
11654       // Select 0xc for each lane used from source operand. Zero has 0xc mask
11655       // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
11656       uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11657       uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11658 
11659       // Check of we need to combine values from two sources within a byte.
11660       if (!(LHSUsedLanes & RHSUsedLanes) &&
11661           // If we select high and lower word keep it for SDWA.
11662           // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11663           !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11664         // Each byte in each mask is either selector mask 0-3, or has higher
11665         // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
11666         // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
11667         // mask which is not 0xff wins. By anding both masks we have a correct
11668         // result except that 0x0c shall be corrected to give 0x0c only.
11669         uint32_t Mask = LHSMask & RHSMask;
11670         for (unsigned I = 0; I < 32; I += 8) {
11671           uint32_t ByteSel = 0xff << I;
11672           if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11673             Mask &= (0x0c << I) & 0xffffffff;
11674         }
11675 
11676         // Add 4 to each active LHS lane. It will not affect any existing 0xff
11677         // or 0x0c.
11678         uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11679         SDLoc DL(N);
11680 
11681         return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
11682                            LHS.getOperand(0), RHS.getOperand(0),
11683                            DAG.getConstant(Sel, DL, MVT::i32));
11684       }
11685     }
11686   }
11687 
11688   return SDValue();
11689 }
11690 
11691 // A key component of v_perm is a mapping between byte position of the src
11692 // operands, and the byte position of the dest. To provide such, we need: 1. the
11693 // node that provides x byte of the dest of the OR, and 2. the byte of the node
11694 // used to provide that x byte. calculateByteProvider finds which node provides
11695 // a certain byte of the dest of the OR, and calculateSrcByte takes that node,
11696 // and finds an ultimate src and byte position For example: The supported
11697 // LoadCombine pattern for vector loads is as follows
11698 //                                t1
11699 //                                or
11700 //                      /                  \
11701 //                      t2                 t3
11702 //                     zext                shl
11703 //                      |                   |     \
11704 //                     t4                  t5     16
11705 //                     or                 anyext
11706 //                 /        \               |
11707 //                t6        t7             t8
11708 //               srl        shl             or
11709 //            /    |      /     \         /     \
11710 //           t9   t10    t11   t12      t13    t14
11711 //         trunc*  8    trunc*  8      and     and
11712 //           |            |          /    |     |    \
11713 //          t15          t16        t17  t18   t19   t20
11714 //                                trunc*  255   srl   -256
11715 //                                   |         /   \
11716 //                                  t15       t15  16
11717 //
11718 // *In this example, the truncs are from i32->i16
11719 //
11720 // calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11721 // respectively. calculateSrcByte would find (given node) -> ultimate src &
11722 // byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11723 // After finding the mapping, we can combine the tree into vperm t15, t16,
11724 // 0x05000407
11725 
11726 // Find the source and byte position from a node.
11727 // \p DestByte is the byte position of the dest of the or that the src
11728 // ultimately provides. \p SrcIndex is the byte of the src that maps to this
11729 // dest of the or byte. \p Depth tracks how many recursive iterations we have
11730 // performed.
11731 static const std::optional<ByteProvider<SDValue>>
11732 calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11733                  unsigned Depth = 0) {
11734   // We may need to recursively traverse a series of SRLs
11735   if (Depth >= 6)
11736     return std::nullopt;
11737 
11738   if (Op.getValueSizeInBits() < 8)
11739     return std::nullopt;
11740 
11741   if (Op.getValueType().isVector())
11742     return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11743 
11744   switch (Op->getOpcode()) {
11745   case ISD::TRUNCATE: {
11746     return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11747   }
11748 
11749   case ISD::SIGN_EXTEND:
11750   case ISD::ZERO_EXTEND:
11751   case ISD::SIGN_EXTEND_INREG: {
11752     SDValue NarrowOp = Op->getOperand(0);
11753     auto NarrowVT = NarrowOp.getValueType();
11754     if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11755       auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11756       NarrowVT = VTSign->getVT();
11757     }
11758     if (!NarrowVT.isByteSized())
11759       return std::nullopt;
11760     uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
11761 
11762     if (SrcIndex >= NarrowByteWidth)
11763       return std::nullopt;
11764     return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11765   }
11766 
11767   case ISD::SRA:
11768   case ISD::SRL: {
11769     auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11770     if (!ShiftOp)
11771       return std::nullopt;
11772 
11773     uint64_t BitShift = ShiftOp->getZExtValue();
11774 
11775     if (BitShift % 8 != 0)
11776       return std::nullopt;
11777 
11778     SrcIndex += BitShift / 8;
11779 
11780     return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11781   }
11782 
11783   default: {
11784     return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11785   }
11786   }
11787   llvm_unreachable("fully handled switch");
11788 }
11789 
11790 // For a byte position in the result of an Or, traverse the tree and find the
11791 // node (and the byte of the node) which ultimately provides this {Or,
11792 // BytePosition}. \p Op is the operand we are currently examining. \p Index is
11793 // the byte position of the Op that corresponds with the originally requested
11794 // byte of the Or \p Depth tracks how many recursive iterations we have
11795 // performed. \p StartingIndex is the originally requested byte of the Or
11796 static const std::optional<ByteProvider<SDValue>>
11797 calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
11798                       unsigned StartingIndex = 0) {
11799   // Finding Src tree of RHS of or typically requires at least 1 additional
11800   // depth
11801   if (Depth > 6)
11802     return std::nullopt;
11803 
11804   unsigned BitWidth = Op.getScalarValueSizeInBits();
11805   if (BitWidth % 8 != 0)
11806     return std::nullopt;
11807   if (Index > BitWidth / 8 - 1)
11808     return std::nullopt;
11809 
11810   bool IsVec = Op.getValueType().isVector();
11811   switch (Op.getOpcode()) {
11812   case ISD::OR: {
11813     if (IsVec)
11814       return std::nullopt;
11815 
11816     auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
11817                                      StartingIndex);
11818     if (!RHS)
11819       return std::nullopt;
11820     auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11821                                      StartingIndex);
11822     if (!LHS)
11823       return std::nullopt;
11824     // A well formed Or will have two ByteProviders for each byte, one of which
11825     // is constant zero
11826     if (!LHS->isConstantZero() && !RHS->isConstantZero())
11827       return std::nullopt;
11828     if (!LHS || LHS->isConstantZero())
11829       return RHS;
11830     if (!RHS || RHS->isConstantZero())
11831       return LHS;
11832     return std::nullopt;
11833   }
11834 
11835   case ISD::AND: {
11836     if (IsVec)
11837       return std::nullopt;
11838 
11839     auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11840     if (!BitMaskOp)
11841       return std::nullopt;
11842 
11843     uint32_t BitMask = BitMaskOp->getZExtValue();
11844     // Bits we expect for our StartingIndex
11845     uint32_t IndexMask = 0xFF << (Index * 8);
11846 
11847     if ((IndexMask & BitMask) != IndexMask) {
11848       // If the result of the and partially provides the byte, then it
11849       // is not well formatted
11850       if (IndexMask & BitMask)
11851         return std::nullopt;
11852       return ByteProvider<SDValue>::getConstantZero();
11853     }
11854 
11855     return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
11856   }
11857 
11858   case ISD::FSHR: {
11859     if (IsVec)
11860       return std::nullopt;
11861 
11862     // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
11863     auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
11864     if (!ShiftOp || Op.getValueType().isVector())
11865       return std::nullopt;
11866 
11867     uint64_t BitsProvided = Op.getValueSizeInBits();
11868     if (BitsProvided % 8 != 0)
11869       return std::nullopt;
11870 
11871     uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
11872     if (BitShift % 8)
11873       return std::nullopt;
11874 
11875     uint64_t ConcatSizeInBytes = BitsProvided / 4;
11876     uint64_t ByteShift = BitShift / 8;
11877 
11878     uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
11879     uint64_t BytesProvided = BitsProvided / 8;
11880     SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
11881     NewIndex %= BytesProvided;
11882     return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
11883   }
11884 
11885   case ISD::SRA:
11886   case ISD::SRL: {
11887     if (IsVec)
11888       return std::nullopt;
11889 
11890     auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11891     if (!ShiftOp)
11892       return std::nullopt;
11893 
11894     uint64_t BitShift = ShiftOp->getZExtValue();
11895     if (BitShift % 8)
11896       return std::nullopt;
11897 
11898     auto BitsProvided = Op.getScalarValueSizeInBits();
11899     if (BitsProvided % 8 != 0)
11900       return std::nullopt;
11901 
11902     uint64_t BytesProvided = BitsProvided / 8;
11903     uint64_t ByteShift = BitShift / 8;
11904     // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
11905     // If the byte we are trying to provide (as tracked by index) falls in this
11906     // range, then the SRL provides the byte. The byte of interest of the src of
11907     // the SRL is Index + ByteShift
11908     return BytesProvided - ByteShift > Index
11909                ? calculateSrcByte(Op->getOperand(0), StartingIndex,
11910                                   Index + ByteShift)
11911                : ByteProvider<SDValue>::getConstantZero();
11912   }
11913 
11914   case ISD::SHL: {
11915     if (IsVec)
11916       return std::nullopt;
11917 
11918     auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11919     if (!ShiftOp)
11920       return std::nullopt;
11921 
11922     uint64_t BitShift = ShiftOp->getZExtValue();
11923     if (BitShift % 8 != 0)
11924       return std::nullopt;
11925     uint64_t ByteShift = BitShift / 8;
11926 
11927     // If we are shifting by an amount greater than (or equal to)
11928     // the index we are trying to provide, then it provides 0s. If not,
11929     // then this bytes are not definitively 0s, and the corresponding byte
11930     // of interest is Index - ByteShift of the src
11931     return Index < ByteShift
11932                ? ByteProvider<SDValue>::getConstantZero()
11933                : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
11934                                        Depth + 1, StartingIndex);
11935   }
11936   case ISD::ANY_EXTEND:
11937   case ISD::SIGN_EXTEND:
11938   case ISD::ZERO_EXTEND:
11939   case ISD::SIGN_EXTEND_INREG:
11940   case ISD::AssertZext:
11941   case ISD::AssertSext: {
11942     if (IsVec)
11943       return std::nullopt;
11944 
11945     SDValue NarrowOp = Op->getOperand(0);
11946     unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
11947     if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
11948         Op->getOpcode() == ISD::AssertZext ||
11949         Op->getOpcode() == ISD::AssertSext) {
11950       auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11951       NarrowBitWidth = VTSign->getVT().getSizeInBits();
11952     }
11953     if (NarrowBitWidth % 8 != 0)
11954       return std::nullopt;
11955     uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11956 
11957     if (Index >= NarrowByteWidth)
11958       return Op.getOpcode() == ISD::ZERO_EXTEND
11959                  ? std::optional<ByteProvider<SDValue>>(
11960                        ByteProvider<SDValue>::getConstantZero())
11961                  : std::nullopt;
11962     return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
11963   }
11964 
11965   case ISD::TRUNCATE: {
11966     if (IsVec)
11967       return std::nullopt;
11968 
11969     uint64_t NarrowByteWidth = BitWidth / 8;
11970 
11971     if (NarrowByteWidth >= Index) {
11972       return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11973                                    StartingIndex);
11974     }
11975 
11976     return std::nullopt;
11977   }
11978 
11979   case ISD::CopyFromReg: {
11980     if (BitWidth / 8 > Index)
11981       return calculateSrcByte(Op, StartingIndex, Index);
11982 
11983     return std::nullopt;
11984   }
11985 
11986   case ISD::LOAD: {
11987     auto L = cast<LoadSDNode>(Op.getNode());
11988 
11989     unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
11990     if (NarrowBitWidth % 8 != 0)
11991       return std::nullopt;
11992     uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11993 
11994     // If the width of the load does not reach byte we are trying to provide for
11995     // and it is not a ZEXTLOAD, then the load does not provide for the byte in
11996     // question
11997     if (Index >= NarrowByteWidth) {
11998       return L->getExtensionType() == ISD::ZEXTLOAD
11999                  ? std::optional<ByteProvider<SDValue>>(
12000                        ByteProvider<SDValue>::getConstantZero())
12001                  : std::nullopt;
12002     }
12003 
12004     if (NarrowByteWidth > Index) {
12005       return calculateSrcByte(Op, StartingIndex, Index);
12006     }
12007 
12008     return std::nullopt;
12009   }
12010 
12011   case ISD::BSWAP: {
12012     if (IsVec)
12013       return std::nullopt;
12014 
12015     return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12016                                  Depth + 1, StartingIndex);
12017   }
12018 
12019   case ISD::EXTRACT_VECTOR_ELT: {
12020     auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12021     if (!IdxOp)
12022       return std::nullopt;
12023     auto VecIdx = IdxOp->getZExtValue();
12024     auto ScalarSize = Op.getScalarValueSizeInBits();
12025     if (ScalarSize < 32)
12026       Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12027     return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
12028                             StartingIndex, Index);
12029   }
12030 
12031   case AMDGPUISD::PERM: {
12032     if (IsVec)
12033       return std::nullopt;
12034 
12035     auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12036     if (!PermMask)
12037       return std::nullopt;
12038 
12039     auto IdxMask =
12040         (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12041     if (IdxMask > 0x07 && IdxMask != 0x0c)
12042       return std::nullopt;
12043 
12044     auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12045     auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12046 
12047     return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12048                            : ByteProvider<SDValue>(
12049                                  ByteProvider<SDValue>::getConstantZero());
12050   }
12051 
12052   default: {
12053     return std::nullopt;
12054   }
12055   }
12056 
12057   llvm_unreachable("fully handled switch");
12058 }
12059 
12060 // Returns true if the Operand is a scalar and is 16 bits
12061 static bool isExtendedFrom16Bits(SDValue &Operand) {
12062 
12063   switch (Operand.getOpcode()) {
12064   case ISD::ANY_EXTEND:
12065   case ISD::SIGN_EXTEND:
12066   case ISD::ZERO_EXTEND: {
12067     auto OpVT = Operand.getOperand(0).getValueType();
12068     return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12069   }
12070   case ISD::LOAD: {
12071     LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12072     auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12073     if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12074         ExtType == ISD::EXTLOAD) {
12075       auto MemVT = L->getMemoryVT();
12076       return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12077     }
12078     return L->getMemoryVT().getSizeInBits() == 16;
12079   }
12080   default:
12081     return false;
12082   }
12083 }
12084 
12085 // Returns true if the mask matches consecutive bytes, and the first byte
12086 // begins at a power of 2 byte offset from 0th byte
12087 static bool addresses16Bits(int Mask) {
12088   int Low8 = Mask & 0xff;
12089   int Hi8 = (Mask & 0xff00) >> 8;
12090 
12091   assert(Low8 < 8 && Hi8 < 8);
12092   // Are the bytes contiguous in the order of increasing addresses.
12093   bool IsConsecutive = (Hi8 - Low8 == 1);
12094   // Is the first byte at location that is aligned for 16 bit instructions.
12095   // A counter example is taking 2 consecutive bytes starting at the 8th bit.
12096   // In this case, we still need code to extract the 16 bit operand, so it
12097   // is better to use i8 v_perm
12098   bool Is16Aligned = !(Low8 % 2);
12099 
12100   return IsConsecutive && Is16Aligned;
12101 }
12102 
12103 // Do not lower into v_perm if the operands are actually 16 bit
12104 // and the selected bits (based on PermMask) correspond with two
12105 // easily addressable 16 bit operands.
12106 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12107                                 SDValue &OtherOp) {
12108   int Low16 = PermMask & 0xffff;
12109   int Hi16 = (PermMask & 0xffff0000) >> 16;
12110 
12111   auto TempOp = peekThroughBitcasts(Op);
12112   auto TempOtherOp = peekThroughBitcasts(OtherOp);
12113 
12114   auto OpIs16Bit =
12115       TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12116   if (!OpIs16Bit)
12117     return true;
12118 
12119   auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12120                         isExtendedFrom16Bits(TempOtherOp);
12121   if (!OtherOpIs16Bit)
12122     return true;
12123 
12124   // Do we cleanly address both
12125   return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12126 }
12127 
12128 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12129                                   unsigned DWordOffset) {
12130   SDValue Ret;
12131 
12132   auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12133   // ByteProvider must be at least 8 bits
12134   assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12135 
12136   if (TypeSize <= 32)
12137     return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12138 
12139   if (Src.getValueType().isVector()) {
12140     auto ScalarTySize = Src.getScalarValueSizeInBits();
12141     auto ScalarTy = Src.getValueType().getScalarType();
12142     if (ScalarTySize == 32) {
12143       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12144                          DAG.getConstant(DWordOffset, SL, MVT::i32));
12145     }
12146     if (ScalarTySize > 32) {
12147       Ret = DAG.getNode(
12148           ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12149           DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12150       auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12151       if (ShiftVal)
12152         Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12153                           DAG.getConstant(ShiftVal, SL, MVT::i32));
12154       return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12155     }
12156 
12157     assert(ScalarTySize < 32);
12158     auto NumElements = TypeSize / ScalarTySize;
12159     auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12160     auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12161     auto NumElementsIn32 = 32 / ScalarTySize;
12162     auto NumAvailElements = DWordOffset < Trunc32Elements
12163                                 ? NumElementsIn32
12164                                 : NumElements - NormalizedTrunc;
12165 
12166     SmallVector<SDValue, 4> VecSrcs;
12167     DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12168                               NumAvailElements);
12169 
12170     Ret = DAG.getBuildVector(
12171         MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12172         VecSrcs);
12173     return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12174   }
12175 
12176   /// Scalar Type
12177   auto ShiftVal = 32 * DWordOffset;
12178   Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12179                     DAG.getConstant(ShiftVal, SL, MVT::i32));
12180   return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12181 }
12182 
12183 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12184   SelectionDAG &DAG = DCI.DAG;
12185   [[maybe_unused]] EVT VT = N->getValueType(0);
12186   SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12187 
12188   // VT is known to be MVT::i32, so we need to provide 4 bytes.
12189   assert(VT == MVT::i32);
12190   for (int i = 0; i < 4; i++) {
12191     // Find the ByteProvider that provides the ith byte of the result of OR
12192     std::optional<ByteProvider<SDValue>> P =
12193         calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12194     // TODO support constantZero
12195     if (!P || P->isConstantZero())
12196       return SDValue();
12197 
12198     PermNodes.push_back(*P);
12199   }
12200   if (PermNodes.size() != 4)
12201     return SDValue();
12202 
12203   std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12204   std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12205   uint64_t PermMask = 0x00000000;
12206   for (size_t i = 0; i < PermNodes.size(); i++) {
12207     auto PermOp = PermNodes[i];
12208     // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12209     // by sizeof(Src2) = 4
12210     int SrcByteAdjust = 4;
12211 
12212     // If the Src uses a byte from a different DWORD, then it corresponds
12213     // with a difference source
12214     if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12215         ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12216       if (SecondSrc)
12217         if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12218             ((PermOp.SrcOffset / 4) != SecondSrc->second))
12219           return SDValue();
12220 
12221       // Set the index of the second distinct Src node
12222       SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12223       assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12224       SrcByteAdjust = 0;
12225     }
12226     assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12227     assert(!DAG.getDataLayout().isBigEndian());
12228     PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12229   }
12230   SDLoc DL(N);
12231   SDValue Op = *PermNodes[FirstSrc.first].Src;
12232   Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12233   assert(Op.getValueSizeInBits() == 32);
12234 
12235   // Check that we are not just extracting the bytes in order from an op
12236   if (!SecondSrc) {
12237     int Low16 = PermMask & 0xffff;
12238     int Hi16 = (PermMask & 0xffff0000) >> 16;
12239 
12240     bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12241     bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12242 
12243     // The perm op would really just produce Op. So combine into Op
12244     if (WellFormedLow && WellFormedHi)
12245       return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12246   }
12247 
12248   SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12249 
12250   if (SecondSrc) {
12251     OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12252     assert(OtherOp.getValueSizeInBits() == 32);
12253   }
12254 
12255   if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12256 
12257     assert(Op.getValueType().isByteSized() &&
12258            OtherOp.getValueType().isByteSized());
12259 
12260     // If the ultimate src is less than 32 bits, then we will only be
12261     // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12262     // CalculateByteProvider would not have returned Op as source if we
12263     // used a byte that is outside its ValueType. Thus, we are free to
12264     // ANY_EXTEND as the extended bits are dont-cares.
12265     Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12266     OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12267 
12268     return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12269                        DAG.getConstant(PermMask, DL, MVT::i32));
12270   }
12271   return SDValue();
12272 }
12273 
12274 SDValue SITargetLowering::performOrCombine(SDNode *N,
12275                                            DAGCombinerInfo &DCI) const {
12276   SelectionDAG &DAG = DCI.DAG;
12277   SDValue LHS = N->getOperand(0);
12278   SDValue RHS = N->getOperand(1);
12279 
12280   EVT VT = N->getValueType(0);
12281   if (VT == MVT::i1) {
12282     // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12283     if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12284         RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12285       SDValue Src = LHS.getOperand(0);
12286       if (Src != RHS.getOperand(0))
12287         return SDValue();
12288 
12289       const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12290       const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12291       if (!CLHS || !CRHS)
12292         return SDValue();
12293 
12294       // Only 10 bits are used.
12295       static const uint32_t MaxMask = 0x3ff;
12296 
12297       uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12298       SDLoc DL(N);
12299       return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
12300                          Src, DAG.getConstant(NewMask, DL, MVT::i32));
12301     }
12302 
12303     return SDValue();
12304   }
12305 
12306   // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12307   if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12308       LHS.getOpcode() == AMDGPUISD::PERM &&
12309       isa<ConstantSDNode>(LHS.getOperand(2))) {
12310     uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12311     if (!Sel)
12312       return SDValue();
12313 
12314     Sel |= LHS.getConstantOperandVal(2);
12315     SDLoc DL(N);
12316     return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12317                        LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12318   }
12319 
12320   // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12321   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12322   if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12323       N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12324 
12325     // If all the uses of an or need to extract the individual elements, do not
12326     // attempt to lower into v_perm
12327     auto usesCombinedOperand = [](SDNode *OrUse) {
12328       // If we have any non-vectorized use, then it is a candidate for v_perm
12329       if (OrUse->getOpcode() != ISD::BITCAST ||
12330           !OrUse->getValueType(0).isVector())
12331         return true;
12332 
12333       // If we have any non-vectorized use, then it is a candidate for v_perm
12334       for (auto VUse : OrUse->uses()) {
12335         if (!VUse->getValueType(0).isVector())
12336           return true;
12337 
12338         // If the use of a vector is a store, then combining via a v_perm
12339         // is beneficial.
12340         // TODO -- whitelist more uses
12341         for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12342           if (VUse->getOpcode() == VectorwiseOp)
12343             return true;
12344       }
12345       return false;
12346     };
12347 
12348     if (!any_of(N->uses(), usesCombinedOperand))
12349       return SDValue();
12350 
12351     uint32_t LHSMask = getPermuteMask(LHS);
12352     uint32_t RHSMask = getPermuteMask(RHS);
12353 
12354     if (LHSMask != ~0u && RHSMask != ~0u) {
12355       // Canonicalize the expression in an attempt to have fewer unique masks
12356       // and therefore fewer registers used to hold the masks.
12357       if (LHSMask > RHSMask) {
12358         std::swap(LHSMask, RHSMask);
12359         std::swap(LHS, RHS);
12360       }
12361 
12362       // Select 0xc for each lane used from source operand. Zero has 0xc mask
12363       // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
12364       uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12365       uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12366 
12367       // Check of we need to combine values from two sources within a byte.
12368       if (!(LHSUsedLanes & RHSUsedLanes) &&
12369           // If we select high and lower word keep it for SDWA.
12370           // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12371           !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12372         // Kill zero bytes selected by other mask. Zero value is 0xc.
12373         LHSMask &= ~RHSUsedLanes;
12374         RHSMask &= ~LHSUsedLanes;
12375         // Add 4 to each active LHS lane
12376         LHSMask |= LHSUsedLanes & 0x04040404;
12377         // Combine masks
12378         uint32_t Sel = LHSMask | RHSMask;
12379         SDLoc DL(N);
12380 
12381         return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
12382                            LHS.getOperand(0), RHS.getOperand(0),
12383                            DAG.getConstant(Sel, DL, MVT::i32));
12384       }
12385     }
12386     if (LHSMask == ~0u || RHSMask == ~0u) {
12387       if (SDValue Perm = matchPERM(N, DCI))
12388         return Perm;
12389     }
12390   }
12391 
12392   if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12393     return SDValue();
12394 
12395   // TODO: This could be a generic combine with a predicate for extracting the
12396   // high half of an integer being free.
12397 
12398   // (or i64:x, (zero_extend i32:y)) ->
12399   //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12400   if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12401       RHS.getOpcode() != ISD::ZERO_EXTEND)
12402     std::swap(LHS, RHS);
12403 
12404   if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12405     SDValue ExtSrc = RHS.getOperand(0);
12406     EVT SrcVT = ExtSrc.getValueType();
12407     if (SrcVT == MVT::i32) {
12408       SDLoc SL(N);
12409       SDValue LowLHS, HiBits;
12410       std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
12411       SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12412 
12413       DCI.AddToWorklist(LowOr.getNode());
12414       DCI.AddToWorklist(HiBits.getNode());
12415 
12416       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
12417                                 LowOr, HiBits);
12418       return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12419     }
12420   }
12421 
12422   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12423   if (CRHS) {
12424     if (SDValue Split
12425           = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12426                                      N->getOperand(0), CRHS))
12427       return Split;
12428   }
12429 
12430   return SDValue();
12431 }
12432 
12433 SDValue SITargetLowering::performXorCombine(SDNode *N,
12434                                             DAGCombinerInfo &DCI) const {
12435   if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12436     return RV;
12437 
12438   SDValue LHS = N->getOperand(0);
12439   SDValue RHS = N->getOperand(1);
12440 
12441   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12442   SelectionDAG &DAG = DCI.DAG;
12443 
12444   EVT VT = N->getValueType(0);
12445   if (CRHS && VT == MVT::i64) {
12446     if (SDValue Split
12447           = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12448       return Split;
12449   }
12450 
12451   // Make sure to apply the 64-bit constant splitting fold before trying to fold
12452   // fneg-like xors into 64-bit select.
12453   if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12454     // This looks like an fneg, try to fold as a source modifier.
12455     if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12456         shouldFoldFNegIntoSrc(N, LHS)) {
12457       // xor (select c, a, b), 0x80000000 ->
12458       //   bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12459       SDLoc DL(N);
12460       SDValue CastLHS =
12461           DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12462       SDValue CastRHS =
12463           DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12464       SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12465       SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12466       SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12467                                       LHS->getOperand(0), FNegLHS, FNegRHS);
12468       return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12469     }
12470   }
12471 
12472   return SDValue();
12473 }
12474 
12475 SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12476                                                    DAGCombinerInfo &DCI) const {
12477   if (!Subtarget->has16BitInsts() ||
12478       DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12479     return SDValue();
12480 
12481   EVT VT = N->getValueType(0);
12482   if (VT != MVT::i32)
12483     return SDValue();
12484 
12485   SDValue Src = N->getOperand(0);
12486   if (Src.getValueType() != MVT::i16)
12487     return SDValue();
12488 
12489   return SDValue();
12490 }
12491 
12492 SDValue
12493 SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12494                                                 DAGCombinerInfo &DCI) const {
12495   SDValue Src = N->getOperand(0);
12496   auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12497 
12498   // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12499   // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12500   if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12501         VTSign->getVT() == MVT::i8) ||
12502        (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12503         VTSign->getVT() == MVT::i16))) {
12504     assert(Subtarget->hasScalarSubwordLoads() &&
12505            "s_buffer_load_{u8, i8} are supported "
12506            "in GFX12 (or newer) architectures.");
12507     EVT VT = Src.getValueType();
12508     unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12509                        ? AMDGPUISD::SBUFFER_LOAD_BYTE
12510                        : AMDGPUISD::SBUFFER_LOAD_SHORT;
12511     SDLoc DL(N);
12512     SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12513     SDValue Ops[] = {
12514         Src.getOperand(0), // source register
12515         Src.getOperand(1), // offset
12516         Src.getOperand(2)  // cachePolicy
12517     };
12518     auto *M = cast<MemSDNode>(Src);
12519     SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12520         Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12521     SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12522     return LoadVal;
12523   }
12524   if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12525         VTSign->getVT() == MVT::i8) ||
12526        (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12527         VTSign->getVT() == MVT::i16)) &&
12528       Src.hasOneUse()) {
12529     auto *M = cast<MemSDNode>(Src);
12530     SDValue Ops[] = {
12531       Src.getOperand(0), // Chain
12532       Src.getOperand(1), // rsrc
12533       Src.getOperand(2), // vindex
12534       Src.getOperand(3), // voffset
12535       Src.getOperand(4), // soffset
12536       Src.getOperand(5), // offset
12537       Src.getOperand(6),
12538       Src.getOperand(7)
12539     };
12540     // replace with BUFFER_LOAD_BYTE/SHORT
12541     SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
12542                                          Src.getOperand(0).getValueType());
12543     unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
12544                    AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
12545     SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
12546                                                           ResList,
12547                                                           Ops, M->getMemoryVT(),
12548                                                           M->getMemOperand());
12549     return DCI.DAG.getMergeValues({BufferLoadSignExt,
12550                                   BufferLoadSignExt.getValue(1)}, SDLoc(N));
12551   }
12552   return SDValue();
12553 }
12554 
12555 SDValue SITargetLowering::performClassCombine(SDNode *N,
12556                                               DAGCombinerInfo &DCI) const {
12557   SelectionDAG &DAG = DCI.DAG;
12558   SDValue Mask = N->getOperand(1);
12559 
12560   // fp_class x, 0 -> false
12561   if (isNullConstant(Mask))
12562     return DAG.getConstant(0, SDLoc(N), MVT::i1);
12563 
12564   if (N->getOperand(0).isUndef())
12565     return DAG.getUNDEF(MVT::i1);
12566 
12567   return SDValue();
12568 }
12569 
12570 SDValue SITargetLowering::performRcpCombine(SDNode *N,
12571                                             DAGCombinerInfo &DCI) const {
12572   EVT VT = N->getValueType(0);
12573   SDValue N0 = N->getOperand(0);
12574 
12575   if (N0.isUndef()) {
12576     return DCI.DAG.getConstantFP(
12577         APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT)), SDLoc(N),
12578         VT);
12579   }
12580 
12581   if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12582                          N0.getOpcode() == ISD::SINT_TO_FP)) {
12583     return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12584                            N->getFlags());
12585   }
12586 
12587   // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12588   if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12589       N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12590     return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
12591                            N0.getOperand(0), N->getFlags());
12592   }
12593 
12594   return AMDGPUTargetLowering::performRcpCombine(N, DCI);
12595 }
12596 
12597 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12598                                        unsigned MaxDepth) const {
12599   unsigned Opcode = Op.getOpcode();
12600   if (Opcode == ISD::FCANONICALIZE)
12601     return true;
12602 
12603   if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12604     const auto &F = CFP->getValueAPF();
12605     if (F.isNaN() && F.isSignaling())
12606       return false;
12607     if (!F.isDenormal())
12608       return true;
12609 
12610     DenormalMode Mode =
12611         DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12612     return Mode == DenormalMode::getIEEE();
12613   }
12614 
12615   // If source is a result of another standard FP operation it is already in
12616   // canonical form.
12617   if (MaxDepth == 0)
12618     return false;
12619 
12620   switch (Opcode) {
12621   // These will flush denorms if required.
12622   case ISD::FADD:
12623   case ISD::FSUB:
12624   case ISD::FMUL:
12625   case ISD::FCEIL:
12626   case ISD::FFLOOR:
12627   case ISD::FMA:
12628   case ISD::FMAD:
12629   case ISD::FSQRT:
12630   case ISD::FDIV:
12631   case ISD::FREM:
12632   case ISD::FP_ROUND:
12633   case ISD::FP_EXTEND:
12634   case ISD::FP16_TO_FP:
12635   case ISD::FP_TO_FP16:
12636   case ISD::BF16_TO_FP:
12637   case ISD::FP_TO_BF16:
12638   case ISD::FLDEXP:
12639   case AMDGPUISD::FMUL_LEGACY:
12640   case AMDGPUISD::FMAD_FTZ:
12641   case AMDGPUISD::RCP:
12642   case AMDGPUISD::RSQ:
12643   case AMDGPUISD::RSQ_CLAMP:
12644   case AMDGPUISD::RCP_LEGACY:
12645   case AMDGPUISD::RCP_IFLAG:
12646   case AMDGPUISD::LOG:
12647   case AMDGPUISD::EXP:
12648   case AMDGPUISD::DIV_SCALE:
12649   case AMDGPUISD::DIV_FMAS:
12650   case AMDGPUISD::DIV_FIXUP:
12651   case AMDGPUISD::FRACT:
12652   case AMDGPUISD::CVT_PKRTZ_F16_F32:
12653   case AMDGPUISD::CVT_F32_UBYTE0:
12654   case AMDGPUISD::CVT_F32_UBYTE1:
12655   case AMDGPUISD::CVT_F32_UBYTE2:
12656   case AMDGPUISD::CVT_F32_UBYTE3:
12657   case AMDGPUISD::FP_TO_FP16:
12658   case AMDGPUISD::SIN_HW:
12659   case AMDGPUISD::COS_HW:
12660     return true;
12661 
12662   // It can/will be lowered or combined as a bit operation.
12663   // Need to check their input recursively to handle.
12664   case ISD::FNEG:
12665   case ISD::FABS:
12666   case ISD::FCOPYSIGN:
12667     return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12668 
12669   case ISD::AND:
12670     if (Op.getValueType() == MVT::i32) {
12671       // Be careful as we only know it is a bitcast floating point type. It
12672       // could be f32, v2f16, we have no way of knowing. Luckily the constant
12673       // value that we optimize for, which comes up in fp32 to bf16 conversions,
12674       // is valid to optimize for all types.
12675       if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12676         if (RHS->getZExtValue() == 0xffff0000) {
12677           return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12678         }
12679       }
12680     }
12681     break;
12682 
12683   case ISD::FSIN:
12684   case ISD::FCOS:
12685   case ISD::FSINCOS:
12686     return Op.getValueType().getScalarType() != MVT::f16;
12687 
12688   case ISD::FMINNUM:
12689   case ISD::FMAXNUM:
12690   case ISD::FMINNUM_IEEE:
12691   case ISD::FMAXNUM_IEEE:
12692   case ISD::FMINIMUM:
12693   case ISD::FMAXIMUM:
12694   case AMDGPUISD::CLAMP:
12695   case AMDGPUISD::FMED3:
12696   case AMDGPUISD::FMAX3:
12697   case AMDGPUISD::FMIN3:
12698   case AMDGPUISD::FMAXIMUM3:
12699   case AMDGPUISD::FMINIMUM3: {
12700     // FIXME: Shouldn't treat the generic operations different based these.
12701     // However, we aren't really required to flush the result from
12702     // minnum/maxnum..
12703 
12704     // snans will be quieted, so we only need to worry about denormals.
12705     if (Subtarget->supportsMinMaxDenormModes() ||
12706         // FIXME: denormalsEnabledForType is broken for dynamic
12707         denormalsEnabledForType(DAG, Op.getValueType()))
12708       return true;
12709 
12710     // Flushing may be required.
12711     // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12712     // targets need to check their input recursively.
12713 
12714     // FIXME: Does this apply with clamp? It's implemented with max.
12715     for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12716       if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12717         return false;
12718     }
12719 
12720     return true;
12721   }
12722   case ISD::SELECT: {
12723     return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12724            isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12725   }
12726   case ISD::BUILD_VECTOR: {
12727     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12728       SDValue SrcOp = Op.getOperand(i);
12729       if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12730         return false;
12731     }
12732 
12733     return true;
12734   }
12735   case ISD::EXTRACT_VECTOR_ELT:
12736   case ISD::EXTRACT_SUBVECTOR: {
12737     return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12738   }
12739   case ISD::INSERT_VECTOR_ELT: {
12740     return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12741            isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12742   }
12743   case ISD::UNDEF:
12744     // Could be anything.
12745     return false;
12746 
12747   case ISD::BITCAST:
12748     // TODO: This is incorrect as it loses track of the operand's type. We may
12749     // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12750     // same bits that are canonicalized in one type need not be in the other.
12751     return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12752   case ISD::TRUNCATE: {
12753     // Hack round the mess we make when legalizing extract_vector_elt
12754     if (Op.getValueType() == MVT::i16) {
12755       SDValue TruncSrc = Op.getOperand(0);
12756       if (TruncSrc.getValueType() == MVT::i32 &&
12757           TruncSrc.getOpcode() == ISD::BITCAST &&
12758           TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12759         return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12760       }
12761     }
12762     return false;
12763   }
12764   case ISD::INTRINSIC_WO_CHAIN: {
12765     unsigned IntrinsicID = Op.getConstantOperandVal(0);
12766     // TODO: Handle more intrinsics
12767     switch (IntrinsicID) {
12768     case Intrinsic::amdgcn_cvt_pkrtz:
12769     case Intrinsic::amdgcn_cubeid:
12770     case Intrinsic::amdgcn_frexp_mant:
12771     case Intrinsic::amdgcn_fdot2:
12772     case Intrinsic::amdgcn_rcp:
12773     case Intrinsic::amdgcn_rsq:
12774     case Intrinsic::amdgcn_rsq_clamp:
12775     case Intrinsic::amdgcn_rcp_legacy:
12776     case Intrinsic::amdgcn_rsq_legacy:
12777     case Intrinsic::amdgcn_trig_preop:
12778     case Intrinsic::amdgcn_log:
12779     case Intrinsic::amdgcn_exp2:
12780     case Intrinsic::amdgcn_sqrt:
12781       return true;
12782     default:
12783       break;
12784     }
12785 
12786     break;
12787   }
12788   default:
12789     break;
12790   }
12791 
12792   // FIXME: denormalsEnabledForType is broken for dynamic
12793   return denormalsEnabledForType(DAG, Op.getValueType()) &&
12794          DAG.isKnownNeverSNaN(Op);
12795 }
12796 
12797 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
12798                                        unsigned MaxDepth) const {
12799   const MachineRegisterInfo &MRI = MF.getRegInfo();
12800   MachineInstr *MI = MRI.getVRegDef(Reg);
12801   unsigned Opcode = MI->getOpcode();
12802 
12803   if (Opcode == AMDGPU::G_FCANONICALIZE)
12804     return true;
12805 
12806   std::optional<FPValueAndVReg> FCR;
12807   // Constant splat (can be padded with undef) or scalar constant.
12808   if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
12809     if (FCR->Value.isSignaling())
12810       return false;
12811     if (!FCR->Value.isDenormal())
12812       return true;
12813 
12814     DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
12815     return Mode == DenormalMode::getIEEE();
12816   }
12817 
12818   if (MaxDepth == 0)
12819     return false;
12820 
12821   switch (Opcode) {
12822   case AMDGPU::G_FADD:
12823   case AMDGPU::G_FSUB:
12824   case AMDGPU::G_FMUL:
12825   case AMDGPU::G_FCEIL:
12826   case AMDGPU::G_FFLOOR:
12827   case AMDGPU::G_FRINT:
12828   case AMDGPU::G_FNEARBYINT:
12829   case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
12830   case AMDGPU::G_INTRINSIC_TRUNC:
12831   case AMDGPU::G_INTRINSIC_ROUNDEVEN:
12832   case AMDGPU::G_FMA:
12833   case AMDGPU::G_FMAD:
12834   case AMDGPU::G_FSQRT:
12835   case AMDGPU::G_FDIV:
12836   case AMDGPU::G_FREM:
12837   case AMDGPU::G_FPOW:
12838   case AMDGPU::G_FPEXT:
12839   case AMDGPU::G_FLOG:
12840   case AMDGPU::G_FLOG2:
12841   case AMDGPU::G_FLOG10:
12842   case AMDGPU::G_FPTRUNC:
12843   case AMDGPU::G_AMDGPU_RCP_IFLAG:
12844   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
12845   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
12846   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
12847   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
12848     return true;
12849   case AMDGPU::G_FNEG:
12850   case AMDGPU::G_FABS:
12851   case AMDGPU::G_FCOPYSIGN:
12852     return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
12853   case AMDGPU::G_FMINNUM:
12854   case AMDGPU::G_FMAXNUM:
12855   case AMDGPU::G_FMINNUM_IEEE:
12856   case AMDGPU::G_FMAXNUM_IEEE:
12857   case AMDGPU::G_FMINIMUM:
12858   case AMDGPU::G_FMAXIMUM: {
12859     if (Subtarget->supportsMinMaxDenormModes() ||
12860         // FIXME: denormalsEnabledForType is broken for dynamic
12861         denormalsEnabledForType(MRI.getType(Reg), MF))
12862       return true;
12863 
12864     [[fallthrough]];
12865   }
12866   case AMDGPU::G_BUILD_VECTOR:
12867     for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
12868       if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
12869         return false;
12870     return true;
12871   case AMDGPU::G_INTRINSIC:
12872   case AMDGPU::G_INTRINSIC_CONVERGENT:
12873     switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
12874     case Intrinsic::amdgcn_fmul_legacy:
12875     case Intrinsic::amdgcn_fmad_ftz:
12876     case Intrinsic::amdgcn_sqrt:
12877     case Intrinsic::amdgcn_fmed3:
12878     case Intrinsic::amdgcn_sin:
12879     case Intrinsic::amdgcn_cos:
12880     case Intrinsic::amdgcn_log:
12881     case Intrinsic::amdgcn_exp2:
12882     case Intrinsic::amdgcn_log_clamp:
12883     case Intrinsic::amdgcn_rcp:
12884     case Intrinsic::amdgcn_rcp_legacy:
12885     case Intrinsic::amdgcn_rsq:
12886     case Intrinsic::amdgcn_rsq_clamp:
12887     case Intrinsic::amdgcn_rsq_legacy:
12888     case Intrinsic::amdgcn_div_scale:
12889     case Intrinsic::amdgcn_div_fmas:
12890     case Intrinsic::amdgcn_div_fixup:
12891     case Intrinsic::amdgcn_fract:
12892     case Intrinsic::amdgcn_cvt_pkrtz:
12893     case Intrinsic::amdgcn_cubeid:
12894     case Intrinsic::amdgcn_cubema:
12895     case Intrinsic::amdgcn_cubesc:
12896     case Intrinsic::amdgcn_cubetc:
12897     case Intrinsic::amdgcn_frexp_mant:
12898     case Intrinsic::amdgcn_fdot2:
12899     case Intrinsic::amdgcn_trig_preop:
12900       return true;
12901     default:
12902       break;
12903     }
12904 
12905     [[fallthrough]];
12906   default:
12907     return false;
12908   }
12909 
12910   llvm_unreachable("invalid operation");
12911 }
12912 
12913 // Constant fold canonicalize.
12914 SDValue SITargetLowering::getCanonicalConstantFP(
12915   SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
12916   // Flush denormals to 0 if not enabled.
12917   if (C.isDenormal()) {
12918     DenormalMode Mode =
12919         DAG.getMachineFunction().getDenormalMode(C.getSemantics());
12920     if (Mode == DenormalMode::getPreserveSign()) {
12921       return DAG.getConstantFP(
12922           APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
12923     }
12924 
12925     if (Mode != DenormalMode::getIEEE())
12926       return SDValue();
12927   }
12928 
12929   if (C.isNaN()) {
12930     APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
12931     if (C.isSignaling()) {
12932       // Quiet a signaling NaN.
12933       // FIXME: Is this supposed to preserve payload bits?
12934       return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12935     }
12936 
12937     // Make sure it is the canonical NaN bitpattern.
12938     //
12939     // TODO: Can we use -1 as the canonical NaN value since it's an inline
12940     // immediate?
12941     if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
12942       return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12943   }
12944 
12945   // Already canonical.
12946   return DAG.getConstantFP(C, SL, VT);
12947 }
12948 
12949 static bool vectorEltWillFoldAway(SDValue Op) {
12950   return Op.isUndef() || isa<ConstantFPSDNode>(Op);
12951 }
12952 
12953 SDValue SITargetLowering::performFCanonicalizeCombine(
12954   SDNode *N,
12955   DAGCombinerInfo &DCI) const {
12956   SelectionDAG &DAG = DCI.DAG;
12957   SDValue N0 = N->getOperand(0);
12958   EVT VT = N->getValueType(0);
12959 
12960   // fcanonicalize undef -> qnan
12961   if (N0.isUndef()) {
12962     APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
12963     return DAG.getConstantFP(QNaN, SDLoc(N), VT);
12964   }
12965 
12966   if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
12967     EVT VT = N->getValueType(0);
12968     return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
12969   }
12970 
12971   // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
12972   //                                                   (fcanonicalize k)
12973   //
12974   // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
12975 
12976   // TODO: This could be better with wider vectors that will be split to v2f16,
12977   // and to consider uses since there aren't that many packed operations.
12978   if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
12979       isTypeLegal(MVT::v2f16)) {
12980     SDLoc SL(N);
12981     SDValue NewElts[2];
12982     SDValue Lo = N0.getOperand(0);
12983     SDValue Hi = N0.getOperand(1);
12984     EVT EltVT = Lo.getValueType();
12985 
12986     if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
12987       for (unsigned I = 0; I != 2; ++I) {
12988         SDValue Op = N0.getOperand(I);
12989         if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12990           NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
12991                                               CFP->getValueAPF());
12992         } else if (Op.isUndef()) {
12993           // Handled below based on what the other operand is.
12994           NewElts[I] = Op;
12995         } else {
12996           NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
12997         }
12998       }
12999 
13000       // If one half is undef, and one is constant, prefer a splat vector rather
13001       // than the normal qNaN. If it's a register, prefer 0.0 since that's
13002       // cheaper to use and may be free with a packed operation.
13003       if (NewElts[0].isUndef()) {
13004         if (isa<ConstantFPSDNode>(NewElts[1]))
13005           NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
13006             NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
13007       }
13008 
13009       if (NewElts[1].isUndef()) {
13010         NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
13011           NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
13012       }
13013 
13014       return DAG.getBuildVector(VT, SL, NewElts);
13015     }
13016   }
13017 
13018   return SDValue();
13019 }
13020 
13021 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13022   switch (Opc) {
13023   case ISD::FMAXNUM:
13024   case ISD::FMAXNUM_IEEE:
13025     return AMDGPUISD::FMAX3;
13026   case ISD::FMAXIMUM:
13027     return AMDGPUISD::FMAXIMUM3;
13028   case ISD::SMAX:
13029     return AMDGPUISD::SMAX3;
13030   case ISD::UMAX:
13031     return AMDGPUISD::UMAX3;
13032   case ISD::FMINNUM:
13033   case ISD::FMINNUM_IEEE:
13034     return AMDGPUISD::FMIN3;
13035   case ISD::FMINIMUM:
13036     return AMDGPUISD::FMINIMUM3;
13037   case ISD::SMIN:
13038     return AMDGPUISD::SMIN3;
13039   case ISD::UMIN:
13040     return AMDGPUISD::UMIN3;
13041   default:
13042     llvm_unreachable("Not a min/max opcode");
13043   }
13044 }
13045 
13046 SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13047                                                    const SDLoc &SL, SDValue Src,
13048                                                    SDValue MinVal,
13049                                                    SDValue MaxVal,
13050                                                    bool Signed) const {
13051 
13052   // med3 comes from
13053   //    min(max(x, K0), K1), K0 < K1
13054   //    max(min(x, K0), K1), K1 < K0
13055   //
13056   // "MinVal" and "MaxVal" respectively refer to the rhs of the
13057   // min/max op.
13058   ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13059   ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13060 
13061   if (!MinK || !MaxK)
13062     return SDValue();
13063 
13064   if (Signed) {
13065     if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13066       return SDValue();
13067   } else {
13068     if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13069       return SDValue();
13070   }
13071 
13072   EVT VT = MinK->getValueType(0);
13073   unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13074   if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13075     return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13076 
13077   // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13078   // not available, but this is unlikely to be profitable as constants
13079   // will often need to be materialized & extended, especially on
13080   // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13081   return SDValue();
13082 }
13083 
13084 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13085   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13086     return C;
13087 
13088   if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13089     if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13090       return C;
13091   }
13092 
13093   return nullptr;
13094 }
13095 
13096 SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13097                                                   const SDLoc &SL,
13098                                                   SDValue Op0,
13099                                                   SDValue Op1) const {
13100   ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13101   if (!K1)
13102     return SDValue();
13103 
13104   ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
13105   if (!K0)
13106     return SDValue();
13107 
13108   // Ordered >= (although NaN inputs should have folded away by now).
13109   if (K0->getValueAPF() > K1->getValueAPF())
13110     return SDValue();
13111 
13112   const MachineFunction &MF = DAG.getMachineFunction();
13113   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13114 
13115   // TODO: Check IEEE bit enabled?
13116   EVT VT = Op0.getValueType();
13117   if (Info->getMode().DX10Clamp) {
13118     // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13119     // hardware fmed3 behavior converting to a min.
13120     // FIXME: Should this be allowing -0.0?
13121     if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13122       return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13123   }
13124 
13125   // med3 for f16 is only available on gfx9+, and not available for v2f16.
13126   if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13127     // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13128     // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13129     // then give the other result, which is different from med3 with a NaN
13130     // input.
13131     SDValue Var = Op0.getOperand(0);
13132     if (!DAG.isKnownNeverSNaN(Var))
13133       return SDValue();
13134 
13135     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13136 
13137     if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13138         (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13139       return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
13140                          Var, SDValue(K0, 0), SDValue(K1, 0));
13141     }
13142   }
13143 
13144   return SDValue();
13145 }
13146 
13147 /// \return true if the subtarget supports minimum3 and maximum3 with the given
13148 /// base min/max opcode \p Opc for type \p VT.
13149 static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13150                              EVT VT) {
13151   switch (Opc) {
13152   case ISD::FMINNUM:
13153   case ISD::FMAXNUM:
13154   case ISD::FMINNUM_IEEE:
13155   case ISD::FMAXNUM_IEEE:
13156   case AMDGPUISD::FMIN_LEGACY:
13157   case AMDGPUISD::FMAX_LEGACY:
13158     return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13159   case ISD::FMINIMUM:
13160   case ISD::FMAXIMUM:
13161     return (VT == MVT::f32 || VT == MVT::f16) && Subtarget.hasIEEEMinMax3();
13162   case ISD::SMAX:
13163   case ISD::SMIN:
13164   case ISD::UMAX:
13165   case ISD::UMIN:
13166     return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13167   default:
13168     return false;
13169   }
13170 
13171   llvm_unreachable("not a min/max opcode");
13172 }
13173 
13174 SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13175                                                DAGCombinerInfo &DCI) const {
13176   SelectionDAG &DAG = DCI.DAG;
13177 
13178   EVT VT = N->getValueType(0);
13179   unsigned Opc = N->getOpcode();
13180   SDValue Op0 = N->getOperand(0);
13181   SDValue Op1 = N->getOperand(1);
13182 
13183   // Only do this if the inner op has one use since this will just increases
13184   // register pressure for no benefit.
13185 
13186   if (supportsMin3Max3(*Subtarget, Opc, VT)) {
13187     // max(max(a, b), c) -> max3(a, b, c)
13188     // min(min(a, b), c) -> min3(a, b, c)
13189     if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13190       SDLoc DL(N);
13191       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13192                          DL,
13193                          N->getValueType(0),
13194                          Op0.getOperand(0),
13195                          Op0.getOperand(1),
13196                          Op1);
13197     }
13198 
13199     // Try commuted.
13200     // max(a, max(b, c)) -> max3(a, b, c)
13201     // min(a, min(b, c)) -> min3(a, b, c)
13202     if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13203       SDLoc DL(N);
13204       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13205                          DL,
13206                          N->getValueType(0),
13207                          Op0,
13208                          Op1.getOperand(0),
13209                          Op1.getOperand(1));
13210     }
13211   }
13212 
13213   // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13214   // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13215   if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13216     if (SDValue Med3 = performIntMed3ImmCombine(
13217             DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13218       return Med3;
13219   }
13220   if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13221     if (SDValue Med3 = performIntMed3ImmCombine(
13222             DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13223       return Med3;
13224   }
13225 
13226   if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13227     if (SDValue Med3 = performIntMed3ImmCombine(
13228             DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13229       return Med3;
13230   }
13231   if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13232     if (SDValue Med3 = performIntMed3ImmCombine(
13233             DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13234       return Med3;
13235   }
13236 
13237   // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13238   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13239        (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13240        (Opc == AMDGPUISD::FMIN_LEGACY &&
13241         Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13242       (VT == MVT::f32 || VT == MVT::f64 ||
13243        (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13244        (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13245       Op0.hasOneUse()) {
13246     if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13247       return Res;
13248   }
13249 
13250   return SDValue();
13251 }
13252 
13253 static bool isClampZeroToOne(SDValue A, SDValue B) {
13254   if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13255     if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13256       // FIXME: Should this be allowing -0.0?
13257       return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13258              (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13259     }
13260   }
13261 
13262   return false;
13263 }
13264 
13265 // FIXME: Should only worry about snans for version with chain.
13266 SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13267                                               DAGCombinerInfo &DCI) const {
13268   EVT VT = N->getValueType(0);
13269   // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13270   // NaNs. With a NaN input, the order of the operands may change the result.
13271 
13272   SelectionDAG &DAG = DCI.DAG;
13273   SDLoc SL(N);
13274 
13275   SDValue Src0 = N->getOperand(0);
13276   SDValue Src1 = N->getOperand(1);
13277   SDValue Src2 = N->getOperand(2);
13278 
13279   if (isClampZeroToOne(Src0, Src1)) {
13280     // const_a, const_b, x -> clamp is safe in all cases including signaling
13281     // nans.
13282     // FIXME: Should this be allowing -0.0?
13283     return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13284   }
13285 
13286   const MachineFunction &MF = DAG.getMachineFunction();
13287   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13288 
13289   // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13290   // handling no dx10-clamp?
13291   if (Info->getMode().DX10Clamp) {
13292     // If NaNs is clamped to 0, we are free to reorder the inputs.
13293 
13294     if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13295       std::swap(Src0, Src1);
13296 
13297     if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13298       std::swap(Src1, Src2);
13299 
13300     if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13301       std::swap(Src0, Src1);
13302 
13303     if (isClampZeroToOne(Src1, Src2))
13304       return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13305   }
13306 
13307   return SDValue();
13308 }
13309 
13310 SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13311                                                  DAGCombinerInfo &DCI) const {
13312   SDValue Src0 = N->getOperand(0);
13313   SDValue Src1 = N->getOperand(1);
13314   if (Src0.isUndef() && Src1.isUndef())
13315     return DCI.DAG.getUNDEF(N->getValueType(0));
13316   return SDValue();
13317 }
13318 
13319 // Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13320 // expanded into a set of cmp/select instructions.
13321 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
13322                                                 unsigned NumElem,
13323                                                 bool IsDivergentIdx,
13324                                                 const GCNSubtarget *Subtarget) {
13325   if (UseDivergentRegisterIndexing)
13326     return false;
13327 
13328   unsigned VecSize = EltSize * NumElem;
13329 
13330   // Sub-dword vectors of size 2 dword or less have better implementation.
13331   if (VecSize <= 64 && EltSize < 32)
13332     return false;
13333 
13334   // Always expand the rest of sub-dword instructions, otherwise it will be
13335   // lowered via memory.
13336   if (EltSize < 32)
13337     return true;
13338 
13339   // Always do this if var-idx is divergent, otherwise it will become a loop.
13340   if (IsDivergentIdx)
13341     return true;
13342 
13343   // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13344   unsigned NumInsts = NumElem /* Number of compares */ +
13345                       ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
13346 
13347   // On some architectures (GFX9) movrel is not available and it's better
13348   // to expand.
13349   if (!Subtarget->hasMovrel())
13350     return NumInsts <= 16;
13351 
13352   // If movrel is available, use it instead of expanding for vector of 8
13353   // elements.
13354   return NumInsts <= 15;
13355 }
13356 
13357 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13358   SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13359   if (isa<ConstantSDNode>(Idx))
13360     return false;
13361 
13362   SDValue Vec = N->getOperand(0);
13363   EVT VecVT = Vec.getValueType();
13364   EVT EltVT = VecVT.getVectorElementType();
13365   unsigned EltSize = EltVT.getSizeInBits();
13366   unsigned NumElem = VecVT.getVectorNumElements();
13367 
13368   return SITargetLowering::shouldExpandVectorDynExt(
13369       EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13370 }
13371 
13372 SDValue SITargetLowering::performExtractVectorEltCombine(
13373   SDNode *N, DAGCombinerInfo &DCI) const {
13374   SDValue Vec = N->getOperand(0);
13375   SelectionDAG &DAG = DCI.DAG;
13376 
13377   EVT VecVT = Vec.getValueType();
13378   EVT VecEltVT = VecVT.getVectorElementType();
13379   EVT ResVT = N->getValueType(0);
13380 
13381   unsigned VecSize = VecVT.getSizeInBits();
13382   unsigned VecEltSize = VecEltVT.getSizeInBits();
13383 
13384   if ((Vec.getOpcode() == ISD::FNEG ||
13385        Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
13386     SDLoc SL(N);
13387     SDValue Idx = N->getOperand(1);
13388     SDValue Elt =
13389         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13390     return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13391   }
13392 
13393   // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13394   //    =>
13395   // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13396   // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13397   // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13398   if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13399     SDLoc SL(N);
13400     SDValue Idx = N->getOperand(1);
13401     unsigned Opc = Vec.getOpcode();
13402 
13403     switch(Opc) {
13404     default:
13405       break;
13406       // TODO: Support other binary operations.
13407     case ISD::FADD:
13408     case ISD::FSUB:
13409     case ISD::FMUL:
13410     case ISD::ADD:
13411     case ISD::UMIN:
13412     case ISD::UMAX:
13413     case ISD::SMIN:
13414     case ISD::SMAX:
13415     case ISD::FMAXNUM:
13416     case ISD::FMINNUM:
13417     case ISD::FMAXNUM_IEEE:
13418     case ISD::FMINNUM_IEEE:
13419     case ISD::FMAXIMUM:
13420     case ISD::FMINIMUM: {
13421       SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13422                                  Vec.getOperand(0), Idx);
13423       SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13424                                  Vec.getOperand(1), Idx);
13425 
13426       DCI.AddToWorklist(Elt0.getNode());
13427       DCI.AddToWorklist(Elt1.getNode());
13428       return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13429     }
13430     }
13431   }
13432 
13433   // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13434   if (shouldExpandVectorDynExt(N)) {
13435     SDLoc SL(N);
13436     SDValue Idx = N->getOperand(1);
13437     SDValue V;
13438     for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13439       SDValue IC = DAG.getVectorIdxConstant(I, SL);
13440       SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13441       if (I == 0)
13442         V = Elt;
13443       else
13444         V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13445     }
13446     return V;
13447   }
13448 
13449   if (!DCI.isBeforeLegalize())
13450     return SDValue();
13451 
13452   // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13453   // elements. This exposes more load reduction opportunities by replacing
13454   // multiple small extract_vector_elements with a single 32-bit extract.
13455   auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13456   if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13457       VecSize > 32 && VecSize % 32 == 0 && Idx) {
13458     EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13459 
13460     unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13461     unsigned EltIdx = BitIndex / 32;
13462     unsigned LeftoverBitIdx = BitIndex % 32;
13463     SDLoc SL(N);
13464 
13465     SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13466     DCI.AddToWorklist(Cast.getNode());
13467 
13468     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13469                               DAG.getConstant(EltIdx, SL, MVT::i32));
13470     DCI.AddToWorklist(Elt.getNode());
13471     SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13472                               DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13473     DCI.AddToWorklist(Srl.getNode());
13474 
13475     EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13476     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13477     DCI.AddToWorklist(Trunc.getNode());
13478 
13479     if (VecEltVT == ResVT) {
13480       return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13481     }
13482 
13483     assert(ResVT.isScalarInteger());
13484     return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13485   }
13486 
13487   return SDValue();
13488 }
13489 
13490 SDValue
13491 SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13492                                                 DAGCombinerInfo &DCI) const {
13493   SDValue Vec = N->getOperand(0);
13494   SDValue Idx = N->getOperand(2);
13495   EVT VecVT = Vec.getValueType();
13496   EVT EltVT = VecVT.getVectorElementType();
13497 
13498   // INSERT_VECTOR_ELT (<n x e>, var-idx)
13499   // => BUILD_VECTOR n x select (e, const-idx)
13500   if (!shouldExpandVectorDynExt(N))
13501     return SDValue();
13502 
13503   SelectionDAG &DAG = DCI.DAG;
13504   SDLoc SL(N);
13505   SDValue Ins = N->getOperand(1);
13506   EVT IdxVT = Idx.getValueType();
13507 
13508   SmallVector<SDValue, 16> Ops;
13509   for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13510     SDValue IC = DAG.getConstant(I, SL, IdxVT);
13511     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13512     SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13513     Ops.push_back(V);
13514   }
13515 
13516   return DAG.getBuildVector(VecVT, SL, Ops);
13517 }
13518 
13519 /// Return the source of an fp_extend from f16 to f32, or a converted FP
13520 /// constant.
13521 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13522   if (Src.getOpcode() == ISD::FP_EXTEND &&
13523       Src.getOperand(0).getValueType() == MVT::f16) {
13524     return Src.getOperand(0);
13525   }
13526 
13527   if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13528     APFloat Val = CFP->getValueAPF();
13529     bool LosesInfo = true;
13530     Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13531     if (!LosesInfo)
13532       return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13533   }
13534 
13535   return SDValue();
13536 }
13537 
13538 SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13539                                                 DAGCombinerInfo &DCI) const {
13540   assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13541          "combine only useful on gfx8");
13542 
13543   SDValue TruncSrc = N->getOperand(0);
13544   EVT VT = N->getValueType(0);
13545   if (VT != MVT::f16)
13546     return SDValue();
13547 
13548   if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13549       TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13550     return SDValue();
13551 
13552   SelectionDAG &DAG = DCI.DAG;
13553   SDLoc SL(N);
13554 
13555   // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13556   // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13557   // casting back.
13558 
13559   // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13560   // fmin(fmax(a, b), fmax(fmin(a, b), c))
13561   SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13562   if (!A)
13563     return SDValue();
13564 
13565   SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13566   if (!B)
13567     return SDValue();
13568 
13569   SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13570   if (!C)
13571     return SDValue();
13572 
13573   // This changes signaling nan behavior. If an input is a signaling nan, it
13574   // would have been quieted by the fpext originally. We don't care because
13575   // these are unconstrained ops. If we needed to insert quieting canonicalizes
13576   // we would be worse off than just doing the promotion.
13577   SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13578   SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13579   SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13580   return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13581 }
13582 
13583 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13584                                           const SDNode *N0,
13585                                           const SDNode *N1) const {
13586   EVT VT = N0->getValueType(0);
13587 
13588   // Only do this if we are not trying to support denormals. v_mad_f32 does not
13589   // support denormals ever.
13590   if (((VT == MVT::f32 &&
13591         denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13592        (VT == MVT::f16 && Subtarget->hasMadF16() &&
13593         denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13594       isOperationLegal(ISD::FMAD, VT))
13595     return ISD::FMAD;
13596 
13597   const TargetOptions &Options = DAG.getTarget().Options;
13598   if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13599        (N0->getFlags().hasAllowContract() &&
13600         N1->getFlags().hasAllowContract())) &&
13601       isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13602     return ISD::FMA;
13603   }
13604 
13605   return 0;
13606 }
13607 
13608 // For a reassociatable opcode perform:
13609 // op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13610 SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13611                                                SelectionDAG &DAG) const {
13612   EVT VT = N->getValueType(0);
13613   if (VT != MVT::i32 && VT != MVT::i64)
13614     return SDValue();
13615 
13616   if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13617     return SDValue();
13618 
13619   unsigned Opc = N->getOpcode();
13620   SDValue Op0 = N->getOperand(0);
13621   SDValue Op1 = N->getOperand(1);
13622 
13623   if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13624     return SDValue();
13625 
13626   if (Op0->isDivergent())
13627     std::swap(Op0, Op1);
13628 
13629   if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13630     return SDValue();
13631 
13632   SDValue Op2 = Op1.getOperand(1);
13633   Op1 = Op1.getOperand(0);
13634   if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13635     return SDValue();
13636 
13637   if (Op1->isDivergent())
13638     std::swap(Op1, Op2);
13639 
13640   SDLoc SL(N);
13641   SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13642   return DAG.getNode(Opc, SL, VT, Add1, Op2);
13643 }
13644 
13645 static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
13646                            EVT VT,
13647                            SDValue N0, SDValue N1, SDValue N2,
13648                            bool Signed) {
13649   unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13650   SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13651   SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13652   return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13653 }
13654 
13655 // Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13656 // multiplies, if any.
13657 //
13658 // Full 64-bit multiplies that feed into an addition are lowered here instead
13659 // of using the generic expansion. The generic expansion ends up with
13660 // a tree of ADD nodes that prevents us from using the "add" part of the
13661 // MAD instruction. The expansion produced here results in a chain of ADDs
13662 // instead of a tree.
13663 SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13664                                             DAGCombinerInfo &DCI) const {
13665   assert(N->getOpcode() == ISD::ADD);
13666 
13667   SelectionDAG &DAG = DCI.DAG;
13668   EVT VT = N->getValueType(0);
13669   SDLoc SL(N);
13670   SDValue LHS = N->getOperand(0);
13671   SDValue RHS = N->getOperand(1);
13672 
13673   if (VT.isVector())
13674     return SDValue();
13675 
13676   // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13677   // result in scalar registers for uniform values.
13678   if (!N->isDivergent() && Subtarget->hasSMulHi())
13679     return SDValue();
13680 
13681   unsigned NumBits = VT.getScalarSizeInBits();
13682   if (NumBits <= 32 || NumBits > 64)
13683     return SDValue();
13684 
13685   if (LHS.getOpcode() != ISD::MUL) {
13686     assert(RHS.getOpcode() == ISD::MUL);
13687     std::swap(LHS, RHS);
13688   }
13689 
13690   // Avoid the fold if it would unduly increase the number of multiplies due to
13691   // multiple uses, except on hardware with full-rate multiply-add (which is
13692   // part of full-rate 64-bit ops).
13693   if (!Subtarget->hasFullRate64Ops()) {
13694     unsigned NumUsers = 0;
13695     for (SDNode *Use : LHS->uses()) {
13696       // There is a use that does not feed into addition, so the multiply can't
13697       // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13698       if (Use->getOpcode() != ISD::ADD)
13699         return SDValue();
13700 
13701       // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13702       // MUL + 3xADD + 3xADDC over 3xMAD.
13703       ++NumUsers;
13704       if (NumUsers >= 3)
13705         return SDValue();
13706     }
13707   }
13708 
13709   SDValue MulLHS = LHS.getOperand(0);
13710   SDValue MulRHS = LHS.getOperand(1);
13711   SDValue AddRHS = RHS;
13712 
13713   // Always check whether operands are small unsigned values, since that
13714   // knowledge is useful in more cases. Check for small signed values only if
13715   // doing so can unlock a shorter code sequence.
13716   bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
13717   bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
13718 
13719   bool MulSignedLo = false;
13720   if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13721     MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 &&
13722                   numBitsSigned(MulRHS, DAG) <= 32;
13723   }
13724 
13725   // The operands and final result all have the same number of bits. If
13726   // operands need to be extended, they can be extended with garbage. The
13727   // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13728   // truncated away in the end.
13729   if (VT != MVT::i64) {
13730     MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
13731     MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
13732     AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
13733   }
13734 
13735   // The basic code generated is conceptually straightforward. Pseudo code:
13736   //
13737   //   accum = mad_64_32 lhs.lo, rhs.lo, accum
13738   //   accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13739   //   accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13740   //
13741   // The second and third lines are optional, depending on whether the factors
13742   // are {sign,zero}-extended or not.
13743   //
13744   // The actual DAG is noisier than the pseudo code, but only due to
13745   // instructions that disassemble values into low and high parts, and
13746   // assemble the final result.
13747   SDValue One = DAG.getConstant(1, SL, MVT::i32);
13748 
13749   auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
13750   auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
13751   SDValue Accum =
13752       getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13753 
13754   if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13755     SDValue AccumLo, AccumHi;
13756     std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13757 
13758     if (!MulLHSUnsigned32) {
13759       auto MulLHSHi =
13760           DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
13761       SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
13762       AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13763     }
13764 
13765     if (!MulRHSUnsigned32) {
13766       auto MulRHSHi =
13767           DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
13768       SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
13769       AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13770     }
13771 
13772     Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
13773     Accum = DAG.getBitcast(MVT::i64, Accum);
13774   }
13775 
13776   if (VT != MVT::i64)
13777     Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
13778   return Accum;
13779 }
13780 
13781 // Collect the ultimate src of each of the mul node's operands, and confirm
13782 // each operand is 8 bytes.
13783 static std::optional<ByteProvider<SDValue>>
13784 handleMulOperand(const SDValue &MulOperand) {
13785   auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
13786   if (!Byte0 || Byte0->isConstantZero()) {
13787     return std::nullopt;
13788   }
13789   auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
13790   if (Byte1 && !Byte1->isConstantZero()) {
13791     return std::nullopt;
13792   }
13793   return Byte0;
13794 }
13795 
13796 static unsigned addPermMasks(unsigned First, unsigned Second) {
13797   unsigned FirstCs = First & 0x0c0c0c0c;
13798   unsigned SecondCs = Second & 0x0c0c0c0c;
13799   unsigned FirstNoCs = First & ~0x0c0c0c0c;
13800   unsigned SecondNoCs = Second & ~0x0c0c0c0c;
13801 
13802   assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
13803   assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
13804   assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
13805   assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
13806 
13807   return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
13808 }
13809 
13810 struct DotSrc {
13811   SDValue SrcOp;
13812   int64_t PermMask;
13813   int64_t DWordOffset;
13814 };
13815 
13816 static void placeSources(ByteProvider<SDValue> &Src0,
13817                          ByteProvider<SDValue> &Src1,
13818                          SmallVectorImpl<DotSrc> &Src0s,
13819                          SmallVectorImpl<DotSrc> &Src1s, int Step) {
13820 
13821   assert(Src0.Src.has_value() && Src1.Src.has_value());
13822   // Src0s and Src1s are empty, just place arbitrarily.
13823   if (Step == 0) {
13824     Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
13825                      Src0.SrcOffset / 4});
13826     Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
13827                      Src1.SrcOffset / 4});
13828     return;
13829   }
13830 
13831   for (int BPI = 0; BPI < 2; BPI++) {
13832     std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
13833     if (BPI == 1) {
13834       BPP = {Src1, Src0};
13835     }
13836     unsigned ZeroMask = 0x0c0c0c0c;
13837     unsigned FMask = 0xFF << (8 * (3 - Step));
13838 
13839     unsigned FirstMask =
13840         (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13841     unsigned SecondMask =
13842         (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13843     // Attempt to find Src vector which contains our SDValue, if so, add our
13844     // perm mask to the existing one. If we are unable to find a match for the
13845     // first SDValue, attempt to find match for the second.
13846     int FirstGroup = -1;
13847     for (int I = 0; I < 2; I++) {
13848       SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
13849       auto MatchesFirst = [&BPP](DotSrc &IterElt) {
13850         return IterElt.SrcOp == *BPP.first.Src &&
13851                (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
13852       };
13853 
13854       auto Match = llvm::find_if(Srcs, MatchesFirst);
13855       if (Match != Srcs.end()) {
13856         Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
13857         FirstGroup = I;
13858         break;
13859       }
13860     }
13861     if (FirstGroup != -1) {
13862       SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
13863       auto MatchesSecond = [&BPP](DotSrc &IterElt) {
13864         return IterElt.SrcOp == *BPP.second.Src &&
13865                (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
13866       };
13867       auto Match = llvm::find_if(Srcs, MatchesSecond);
13868       if (Match != Srcs.end()) {
13869         Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
13870       } else
13871         Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
13872       return;
13873     }
13874   }
13875 
13876   // If we have made it here, then we could not find a match in Src0s or Src1s
13877   // for either Src0 or Src1, so just place them arbitrarily.
13878 
13879   unsigned ZeroMask = 0x0c0c0c0c;
13880   unsigned FMask = 0xFF << (8 * (3 - Step));
13881 
13882   Src0s.push_back(
13883       {*Src0.Src,
13884        ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13885        Src1.SrcOffset / 4});
13886   Src1s.push_back(
13887       {*Src1.Src,
13888        ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13889        Src1.SrcOffset / 4});
13890 
13891   return;
13892 }
13893 
13894 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
13895                               SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
13896                               bool IsAny) {
13897 
13898   // If we just have one source, just permute it accordingly.
13899   if (Srcs.size() == 1) {
13900     auto Elt = Srcs.begin();
13901     auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
13902 
13903     // v_perm will produce the original value
13904     if (Elt->PermMask == 0x3020100)
13905       return EltOp;
13906 
13907     return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13908                        DAG.getConstant(Elt->PermMask, SL, MVT::i32));
13909   }
13910 
13911   auto FirstElt = Srcs.begin();
13912   auto SecondElt = std::next(FirstElt);
13913 
13914   SmallVector<SDValue, 2> Perms;
13915 
13916   // If we have multiple sources in the chain, combine them via perms (using
13917   // calculated perm mask) and Ors.
13918   while (true) {
13919     auto FirstMask = FirstElt->PermMask;
13920     auto SecondMask = SecondElt->PermMask;
13921 
13922     unsigned FirstCs = FirstMask & 0x0c0c0c0c;
13923     unsigned FirstPlusFour = FirstMask | 0x04040404;
13924     // 0x0c + 0x04 = 0x10, so anding with 0x0F will produced 0x00 for any
13925     // original 0x0C.
13926     FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
13927 
13928     auto PermMask = addPermMasks(FirstMask, SecondMask);
13929     auto FirstVal =
13930         getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13931     auto SecondVal =
13932         getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
13933 
13934     Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
13935                                 SecondVal,
13936                                 DAG.getConstant(PermMask, SL, MVT::i32)));
13937 
13938     FirstElt = std::next(SecondElt);
13939     if (FirstElt == Srcs.end())
13940       break;
13941 
13942     SecondElt = std::next(FirstElt);
13943     // If we only have a FirstElt, then just combine that into the cumulative
13944     // source node.
13945     if (SecondElt == Srcs.end()) {
13946       auto EltOp =
13947           getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13948 
13949       Perms.push_back(
13950           DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13951                       DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
13952       break;
13953     }
13954   }
13955 
13956   assert(Perms.size() == 1 || Perms.size() == 2);
13957   return Perms.size() == 2
13958              ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
13959              : Perms[0];
13960 }
13961 
13962 static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
13963   for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
13964     EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13965     auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
13966     EntryMask += ZeroMask;
13967   }
13968 }
13969 
13970 static bool isMul(const SDValue Op) {
13971   auto Opcode = Op.getOpcode();
13972 
13973   return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
13974           Opcode == AMDGPUISD::MUL_I24);
13975 }
13976 
13977 static std::optional<bool>
13978 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
13979                        ByteProvider<SDValue> &Src1, const SDValue &S0Op,
13980                        const SDValue &S1Op, const SelectionDAG &DAG) {
13981   // If we both ops are i8s (pre legalize-dag), then the signedness semantics
13982   // of the dot4 is irrelevant.
13983   if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
13984     return false;
13985 
13986   auto Known0 = DAG.computeKnownBits(S0Op, 0);
13987   bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
13988   bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
13989   auto Known1 = DAG.computeKnownBits(S1Op, 0);
13990   bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
13991   bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
13992 
13993   assert(!(S0IsUnsigned && S0IsSigned));
13994   assert(!(S1IsUnsigned && S1IsSigned));
13995 
13996   // There are 9 possible permutations of
13997   // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
13998 
13999   // In two permutations, the sign bits are known to be the same for both Ops,
14000   // so simply return Signed / Unsigned corresponding to the MSB
14001 
14002   if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14003     return S0IsSigned;
14004 
14005   // In another two permutations, the sign bits are known to be opposite. In
14006   // this case return std::nullopt to indicate a bad match.
14007 
14008   if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14009     return std::nullopt;
14010 
14011   // In the remaining five permutations, we don't know the value of the sign
14012   // bit for at least one Op. Since we have a valid ByteProvider, we know that
14013   // the upper bits must be extension bits. Thus, the only ways for the sign
14014   // bit to be unknown is if it was sign extended from unknown value, or if it
14015   // was any extended. In either case, it is correct to use the signed
14016   // version of the signedness semantics of dot4
14017 
14018   // In two of such permutations, we known the sign bit is set for
14019   // one op, and the other is unknown. It is okay to used signed version of
14020   // dot4.
14021   if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14022       ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14023     return true;
14024 
14025   // In one such permutation, we don't know either of the sign bits. It is okay
14026   // to used the signed version of dot4.
14027   if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14028     return true;
14029 
14030   // In two of such permutations, we known the sign bit is unset for
14031   // one op, and the other is unknown. Return std::nullopt to indicate a
14032   // bad match.
14033   if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14034       ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14035     return std::nullopt;
14036 
14037   llvm_unreachable("Fully covered condition");
14038 }
14039 
14040 SDValue SITargetLowering::performAddCombine(SDNode *N,
14041                                             DAGCombinerInfo &DCI) const {
14042   SelectionDAG &DAG = DCI.DAG;
14043   EVT VT = N->getValueType(0);
14044   SDLoc SL(N);
14045   SDValue LHS = N->getOperand(0);
14046   SDValue RHS = N->getOperand(1);
14047 
14048   if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14049     if (Subtarget->hasMad64_32()) {
14050       if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14051         return Folded;
14052     }
14053   }
14054 
14055   if (SDValue V = reassociateScalarOps(N, DAG)) {
14056     return V;
14057   }
14058 
14059   if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14060       (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14061     SDValue TempNode(N, 0);
14062     std::optional<bool> IsSigned;
14063     SmallVector<DotSrc, 4> Src0s;
14064     SmallVector<DotSrc, 4> Src1s;
14065     SmallVector<SDValue, 4> Src2s;
14066 
14067     // Match the v_dot4 tree, while collecting src nodes.
14068     int ChainLength = 0;
14069     for (int I = 0; I < 4; I++) {
14070       auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14071       if (MulIdx == -1)
14072         break;
14073       auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14074       if (!Src0)
14075         break;
14076       auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14077       if (!Src1)
14078         break;
14079 
14080       auto IterIsSigned = checkDot4MulSignedness(
14081           TempNode->getOperand(MulIdx), *Src0, *Src1,
14082           TempNode->getOperand(MulIdx)->getOperand(0),
14083           TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14084       if (!IterIsSigned)
14085         break;
14086       if (!IsSigned)
14087         IsSigned = *IterIsSigned;
14088       if (*IterIsSigned != *IsSigned)
14089         break;
14090       placeSources(*Src0, *Src1, Src0s, Src1s, I);
14091       auto AddIdx = 1 - MulIdx;
14092       // Allow the special case where add (add (mul24, 0), mul24) became ->
14093       // add (mul24, mul24).
14094       if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14095         Src2s.push_back(TempNode->getOperand(AddIdx));
14096         auto Src0 =
14097             handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14098         if (!Src0)
14099           break;
14100         auto Src1 =
14101             handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14102         if (!Src1)
14103           break;
14104         auto IterIsSigned = checkDot4MulSignedness(
14105             TempNode->getOperand(AddIdx), *Src0, *Src1,
14106             TempNode->getOperand(AddIdx)->getOperand(0),
14107             TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14108         if (!IterIsSigned)
14109           break;
14110         assert(IsSigned);
14111         if (*IterIsSigned != *IsSigned)
14112           break;
14113         placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14114         Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14115         ChainLength = I + 2;
14116         break;
14117       }
14118 
14119       TempNode = TempNode->getOperand(AddIdx);
14120       Src2s.push_back(TempNode);
14121       ChainLength = I + 1;
14122       if (TempNode->getNumOperands() < 2)
14123         break;
14124       LHS = TempNode->getOperand(0);
14125       RHS = TempNode->getOperand(1);
14126     }
14127 
14128     if (ChainLength < 2)
14129       return SDValue();
14130 
14131     // Masks were constructed with assumption that we would find a chain of
14132     // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
14133     // 0x0c) so they do not affect dot calculation.
14134     if (ChainLength < 4) {
14135       fixMasks(Src0s, ChainLength);
14136       fixMasks(Src1s, ChainLength);
14137     }
14138 
14139     SDValue Src0, Src1;
14140 
14141     // If we are just using a single source for both, and have permuted the
14142     // bytes consistently, we can just use the sources without permuting
14143     // (commutation).
14144     bool UseOriginalSrc = false;
14145     if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14146         Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14147         Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14148         Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14149       SmallVector<unsigned, 4> SrcBytes;
14150       auto Src0Mask = Src0s.begin()->PermMask;
14151       SrcBytes.push_back(Src0Mask & 0xFF000000);
14152       bool UniqueEntries = true;
14153       for (auto I = 1; I < 4; I++) {
14154         auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14155 
14156         if (is_contained(SrcBytes, NextByte)) {
14157           UniqueEntries = false;
14158           break;
14159         }
14160         SrcBytes.push_back(NextByte);
14161       }
14162 
14163       if (UniqueEntries) {
14164         UseOriginalSrc = true;
14165 
14166         auto FirstElt = Src0s.begin();
14167         auto FirstEltOp =
14168             getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14169 
14170         auto SecondElt = Src1s.begin();
14171         auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14172                                               SecondElt->DWordOffset);
14173 
14174         Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14175                                              MVT::getIntegerVT(32));
14176         Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14177                                              MVT::getIntegerVT(32));
14178       }
14179     }
14180 
14181     if (!UseOriginalSrc) {
14182       Src0 = resolveSources(DAG, SL, Src0s, false, true);
14183       Src1 = resolveSources(DAG, SL, Src1s, false, true);
14184     }
14185 
14186     assert(IsSigned);
14187     SDValue Src2 =
14188         DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14189 
14190     SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14191                                                   : Intrinsic::amdgcn_udot4,
14192                                         SL, MVT::i64);
14193 
14194     assert(!VT.isVector());
14195     auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14196                            Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14197 
14198     return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14199   }
14200 
14201   if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14202     return SDValue();
14203 
14204   // add x, zext (setcc) => uaddo_carry x, 0, setcc
14205   // add x, sext (setcc) => usubo_carry x, 0, setcc
14206   unsigned Opc = LHS.getOpcode();
14207   if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14208       Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14209     std::swap(RHS, LHS);
14210 
14211   Opc = RHS.getOpcode();
14212   switch (Opc) {
14213   default: break;
14214   case ISD::ZERO_EXTEND:
14215   case ISD::SIGN_EXTEND:
14216   case ISD::ANY_EXTEND: {
14217     auto Cond = RHS.getOperand(0);
14218     // If this won't be a real VOPC output, we would still need to insert an
14219     // extra instruction anyway.
14220     if (!isBoolSGPR(Cond))
14221       break;
14222     SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14223     SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14224     Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14225     return DAG.getNode(Opc, SL, VTList, Args);
14226   }
14227   case ISD::UADDO_CARRY: {
14228     // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14229     if (!isNullConstant(RHS.getOperand(1)))
14230       break;
14231     SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
14232     return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14233   }
14234   }
14235   return SDValue();
14236 }
14237 
14238 SDValue SITargetLowering::performSubCombine(SDNode *N,
14239                                             DAGCombinerInfo &DCI) const {
14240   SelectionDAG &DAG = DCI.DAG;
14241   EVT VT = N->getValueType(0);
14242 
14243   if (VT != MVT::i32)
14244     return SDValue();
14245 
14246   SDLoc SL(N);
14247   SDValue LHS = N->getOperand(0);
14248   SDValue RHS = N->getOperand(1);
14249 
14250   // sub x, zext (setcc) => usubo_carry x, 0, setcc
14251   // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14252   unsigned Opc = RHS.getOpcode();
14253   switch (Opc) {
14254   default: break;
14255   case ISD::ZERO_EXTEND:
14256   case ISD::SIGN_EXTEND:
14257   case ISD::ANY_EXTEND: {
14258     auto Cond = RHS.getOperand(0);
14259     // If this won't be a real VOPC output, we would still need to insert an
14260     // extra instruction anyway.
14261     if (!isBoolSGPR(Cond))
14262       break;
14263     SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14264     SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14265     Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14266     return DAG.getNode(Opc, SL, VTList, Args);
14267   }
14268   }
14269 
14270   if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14271     // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14272     if (!isNullConstant(LHS.getOperand(1)))
14273       return SDValue();
14274     SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
14275     return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14276   }
14277   return SDValue();
14278 }
14279 
14280 SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14281   DAGCombinerInfo &DCI) const {
14282 
14283   if (N->getValueType(0) != MVT::i32)
14284     return SDValue();
14285 
14286   if (!isNullConstant(N->getOperand(1)))
14287     return SDValue();
14288 
14289   SelectionDAG &DAG = DCI.DAG;
14290   SDValue LHS = N->getOperand(0);
14291 
14292   // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14293   // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14294   unsigned LHSOpc = LHS.getOpcode();
14295   unsigned Opc = N->getOpcode();
14296   if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14297       (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14298     SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
14299     return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14300   }
14301   return SDValue();
14302 }
14303 
14304 SDValue SITargetLowering::performFAddCombine(SDNode *N,
14305                                              DAGCombinerInfo &DCI) const {
14306   if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14307     return SDValue();
14308 
14309   SelectionDAG &DAG = DCI.DAG;
14310   EVT VT = N->getValueType(0);
14311 
14312   SDLoc SL(N);
14313   SDValue LHS = N->getOperand(0);
14314   SDValue RHS = N->getOperand(1);
14315 
14316   // These should really be instruction patterns, but writing patterns with
14317   // source modifiers is a pain.
14318 
14319   // fadd (fadd (a, a), b) -> mad 2.0, a, b
14320   if (LHS.getOpcode() == ISD::FADD) {
14321     SDValue A = LHS.getOperand(0);
14322     if (A == LHS.getOperand(1)) {
14323       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14324       if (FusedOp != 0) {
14325         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14326         return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14327       }
14328     }
14329   }
14330 
14331   // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14332   if (RHS.getOpcode() == ISD::FADD) {
14333     SDValue A = RHS.getOperand(0);
14334     if (A == RHS.getOperand(1)) {
14335       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14336       if (FusedOp != 0) {
14337         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14338         return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14339       }
14340     }
14341   }
14342 
14343   return SDValue();
14344 }
14345 
14346 SDValue SITargetLowering::performFSubCombine(SDNode *N,
14347                                              DAGCombinerInfo &DCI) const {
14348   if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14349     return SDValue();
14350 
14351   SelectionDAG &DAG = DCI.DAG;
14352   SDLoc SL(N);
14353   EVT VT = N->getValueType(0);
14354   assert(!VT.isVector());
14355 
14356   // Try to get the fneg to fold into the source modifier. This undoes generic
14357   // DAG combines and folds them into the mad.
14358   //
14359   // Only do this if we are not trying to support denormals. v_mad_f32 does
14360   // not support denormals ever.
14361   SDValue LHS = N->getOperand(0);
14362   SDValue RHS = N->getOperand(1);
14363   if (LHS.getOpcode() == ISD::FADD) {
14364     // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14365     SDValue A = LHS.getOperand(0);
14366     if (A == LHS.getOperand(1)) {
14367       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14368       if (FusedOp != 0){
14369         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14370         SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14371 
14372         return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14373       }
14374     }
14375   }
14376 
14377   if (RHS.getOpcode() == ISD::FADD) {
14378     // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14379 
14380     SDValue A = RHS.getOperand(0);
14381     if (A == RHS.getOperand(1)) {
14382       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14383       if (FusedOp != 0){
14384         const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14385         return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14386       }
14387     }
14388   }
14389 
14390   return SDValue();
14391 }
14392 
14393 SDValue SITargetLowering::performFDivCombine(SDNode *N,
14394                                              DAGCombinerInfo &DCI) const {
14395   SelectionDAG &DAG = DCI.DAG;
14396   SDLoc SL(N);
14397   EVT VT = N->getValueType(0);
14398   if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14399     return SDValue();
14400 
14401   SDValue LHS = N->getOperand(0);
14402   SDValue RHS = N->getOperand(1);
14403 
14404   SDNodeFlags Flags = N->getFlags();
14405   SDNodeFlags RHSFlags = RHS->getFlags();
14406   if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14407       !RHS->hasOneUse())
14408     return SDValue();
14409 
14410   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14411     bool IsNegative = false;
14412     if (CLHS->isExactlyValue(1.0) ||
14413         (IsNegative = CLHS->isExactlyValue(-1.0))) {
14414       // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14415       // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14416       if (RHS.getOpcode() == ISD::FSQRT) {
14417         // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14418         SDValue Rsq =
14419             DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14420         return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14421       }
14422     }
14423   }
14424 
14425   return SDValue();
14426 }
14427 
14428 SDValue SITargetLowering::performFMACombine(SDNode *N,
14429                                             DAGCombinerInfo &DCI) const {
14430   SelectionDAG &DAG = DCI.DAG;
14431   EVT VT = N->getValueType(0);
14432   SDLoc SL(N);
14433 
14434   if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
14435     return SDValue();
14436 
14437   // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14438   //   FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
14439   SDValue Op1 = N->getOperand(0);
14440   SDValue Op2 = N->getOperand(1);
14441   SDValue FMA = N->getOperand(2);
14442 
14443   if (FMA.getOpcode() != ISD::FMA ||
14444       Op1.getOpcode() != ISD::FP_EXTEND ||
14445       Op2.getOpcode() != ISD::FP_EXTEND)
14446     return SDValue();
14447 
14448   // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14449   // regardless of the denorm mode setting. Therefore,
14450   // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14451   const TargetOptions &Options = DAG.getTarget().Options;
14452   if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14453       (N->getFlags().hasAllowContract() &&
14454        FMA->getFlags().hasAllowContract())) {
14455     Op1 = Op1.getOperand(0);
14456     Op2 = Op2.getOperand(0);
14457     if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14458         Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14459       return SDValue();
14460 
14461     SDValue Vec1 = Op1.getOperand(0);
14462     SDValue Idx1 = Op1.getOperand(1);
14463     SDValue Vec2 = Op2.getOperand(0);
14464 
14465     SDValue FMAOp1 = FMA.getOperand(0);
14466     SDValue FMAOp2 = FMA.getOperand(1);
14467     SDValue FMAAcc = FMA.getOperand(2);
14468 
14469     if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14470         FMAOp2.getOpcode() != ISD::FP_EXTEND)
14471       return SDValue();
14472 
14473     FMAOp1 = FMAOp1.getOperand(0);
14474     FMAOp2 = FMAOp2.getOperand(0);
14475     if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14476         FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14477       return SDValue();
14478 
14479     SDValue Vec3 = FMAOp1.getOperand(0);
14480     SDValue Vec4 = FMAOp2.getOperand(0);
14481     SDValue Idx2 = FMAOp1.getOperand(1);
14482 
14483     if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14484         // Idx1 and Idx2 cannot be the same.
14485         Idx1 == Idx2)
14486       return SDValue();
14487 
14488     if (Vec1 == Vec2 || Vec3 == Vec4)
14489       return SDValue();
14490 
14491     if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14492       return SDValue();
14493 
14494     if ((Vec1 == Vec3 && Vec2 == Vec4) ||
14495         (Vec1 == Vec4 && Vec2 == Vec3)) {
14496       return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14497                          DAG.getTargetConstant(0, SL, MVT::i1));
14498     }
14499   }
14500   return SDValue();
14501 }
14502 
14503 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14504                                               DAGCombinerInfo &DCI) const {
14505   SelectionDAG &DAG = DCI.DAG;
14506   SDLoc SL(N);
14507 
14508   SDValue LHS = N->getOperand(0);
14509   SDValue RHS = N->getOperand(1);
14510   EVT VT = LHS.getValueType();
14511   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14512 
14513   auto CRHS = dyn_cast<ConstantSDNode>(RHS);
14514   if (!CRHS) {
14515     CRHS = dyn_cast<ConstantSDNode>(LHS);
14516     if (CRHS) {
14517       std::swap(LHS, RHS);
14518       CC = getSetCCSwappedOperands(CC);
14519     }
14520   }
14521 
14522   if (CRHS) {
14523     if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14524         isBoolSGPR(LHS.getOperand(0))) {
14525       // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14526       // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14527       // setcc (sext from i1 cc),  0, eq|sge|ule) => not cc => xor cc, -1
14528       // setcc (sext from i1 cc),  0, ne|ugt|slt) => cc
14529       if ((CRHS->isAllOnes() &&
14530            (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14531           (CRHS->isZero() &&
14532            (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14533         return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14534                            DAG.getConstant(-1, SL, MVT::i1));
14535       if ((CRHS->isAllOnes() &&
14536            (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14537           (CRHS->isZero() &&
14538            (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14539         return LHS.getOperand(0);
14540     }
14541 
14542     const APInt &CRHSVal = CRHS->getAPIntValue();
14543     if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14544         LHS.getOpcode() == ISD::SELECT &&
14545         isa<ConstantSDNode>(LHS.getOperand(1)) &&
14546         isa<ConstantSDNode>(LHS.getOperand(2)) &&
14547         LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14548         isBoolSGPR(LHS.getOperand(0))) {
14549       // Given CT != FT:
14550       // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14551       // setcc (select cc, CT, CF), CF, ne => cc
14552       // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14553       // setcc (select cc, CT, CF), CT, eq => cc
14554       const APInt &CT = LHS.getConstantOperandAPInt(1);
14555       const APInt &CF = LHS.getConstantOperandAPInt(2);
14556 
14557       if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14558           (CT == CRHSVal && CC == ISD::SETNE))
14559         return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14560                            DAG.getConstant(-1, SL, MVT::i1));
14561       if ((CF == CRHSVal && CC == ISD::SETNE) ||
14562           (CT == CRHSVal && CC == ISD::SETEQ))
14563         return LHS.getOperand(0);
14564     }
14565   }
14566 
14567   if (VT != MVT::f32 && VT != MVT::f64 &&
14568       (!Subtarget->has16BitInsts() || VT != MVT::f16))
14569     return SDValue();
14570 
14571   // Match isinf/isfinite pattern
14572   // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14573   // (fcmp one (fabs x), inf) -> (fp_class x,
14574   // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14575   if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
14576     const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
14577     if (!CRHS)
14578       return SDValue();
14579 
14580     const APFloat &APF = CRHS->getValueAPF();
14581     if (APF.isInfinity() && !APF.isNegative()) {
14582       const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
14583                                  SIInstrFlags::N_INFINITY;
14584       const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
14585                                     SIInstrFlags::P_ZERO |
14586                                     SIInstrFlags::N_NORMAL |
14587                                     SIInstrFlags::P_NORMAL |
14588                                     SIInstrFlags::N_SUBNORMAL |
14589                                     SIInstrFlags::P_SUBNORMAL;
14590       unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14591       return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14592                          DAG.getConstant(Mask, SL, MVT::i32));
14593     }
14594   }
14595 
14596   return SDValue();
14597 }
14598 
14599 SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14600                                                      DAGCombinerInfo &DCI) const {
14601   SelectionDAG &DAG = DCI.DAG;
14602   SDLoc SL(N);
14603   unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14604 
14605   SDValue Src = N->getOperand(0);
14606   SDValue Shift = N->getOperand(0);
14607 
14608   // TODO: Extend type shouldn't matter (assuming legal types).
14609   if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14610     Shift = Shift.getOperand(0);
14611 
14612   if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14613     // cvt_f32_ubyte1 (shl x,  8) -> cvt_f32_ubyte0 x
14614     // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14615     // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14616     // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14617     // cvt_f32_ubyte0 (srl x,  8) -> cvt_f32_ubyte1 x
14618     if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14619       SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
14620                                  SDLoc(Shift.getOperand(0)), MVT::i32);
14621 
14622       unsigned ShiftOffset = 8 * Offset;
14623       if (Shift.getOpcode() == ISD::SHL)
14624         ShiftOffset -= C->getZExtValue();
14625       else
14626         ShiftOffset += C->getZExtValue();
14627 
14628       if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14629         return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
14630                            MVT::f32, Shifted);
14631       }
14632     }
14633   }
14634 
14635   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14636   APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
14637   if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
14638     // We simplified Src. If this node is not dead, visit it again so it is
14639     // folded properly.
14640     if (N->getOpcode() != ISD::DELETED_NODE)
14641       DCI.AddToWorklist(N);
14642     return SDValue(N, 0);
14643   }
14644 
14645   // Handle (or x, (srl y, 8)) pattern when known bits are zero.
14646   if (SDValue DemandedSrc =
14647           TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
14648     return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14649 
14650   return SDValue();
14651 }
14652 
14653 SDValue SITargetLowering::performClampCombine(SDNode *N,
14654                                               DAGCombinerInfo &DCI) const {
14655   ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
14656   if (!CSrc)
14657     return SDValue();
14658 
14659   const MachineFunction &MF = DCI.DAG.getMachineFunction();
14660   const APFloat &F = CSrc->getValueAPF();
14661   APFloat Zero = APFloat::getZero(F.getSemantics());
14662   if (F < Zero ||
14663       (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14664     return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14665   }
14666 
14667   APFloat One(F.getSemantics(), "1.0");
14668   if (F > One)
14669     return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14670 
14671   return SDValue(CSrc, 0);
14672 }
14673 
14674 
14675 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
14676                                             DAGCombinerInfo &DCI) const {
14677   if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
14678     return SDValue();
14679   switch (N->getOpcode()) {
14680   case ISD::ADD:
14681     return performAddCombine(N, DCI);
14682   case ISD::SUB:
14683     return performSubCombine(N, DCI);
14684   case ISD::UADDO_CARRY:
14685   case ISD::USUBO_CARRY:
14686     return performAddCarrySubCarryCombine(N, DCI);
14687   case ISD::FADD:
14688     return performFAddCombine(N, DCI);
14689   case ISD::FSUB:
14690     return performFSubCombine(N, DCI);
14691   case ISD::FDIV:
14692     return performFDivCombine(N, DCI);
14693   case ISD::SETCC:
14694     return performSetCCCombine(N, DCI);
14695   case ISD::FMAXNUM:
14696   case ISD::FMINNUM:
14697   case ISD::FMAXNUM_IEEE:
14698   case ISD::FMINNUM_IEEE:
14699   case ISD::FMAXIMUM:
14700   case ISD::FMINIMUM:
14701   case ISD::SMAX:
14702   case ISD::SMIN:
14703   case ISD::UMAX:
14704   case ISD::UMIN:
14705   case AMDGPUISD::FMIN_LEGACY:
14706   case AMDGPUISD::FMAX_LEGACY:
14707     return performMinMaxCombine(N, DCI);
14708   case ISD::FMA:
14709     return performFMACombine(N, DCI);
14710   case ISD::AND:
14711     return performAndCombine(N, DCI);
14712   case ISD::OR:
14713     return performOrCombine(N, DCI);
14714   case ISD::FSHR: {
14715     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14716     if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
14717         TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14718       return matchPERM(N, DCI);
14719     }
14720     break;
14721   }
14722   case ISD::XOR:
14723     return performXorCombine(N, DCI);
14724   case ISD::ZERO_EXTEND:
14725     return performZeroExtendCombine(N, DCI);
14726   case ISD::SIGN_EXTEND_INREG:
14727     return performSignExtendInRegCombine(N , DCI);
14728   case AMDGPUISD::FP_CLASS:
14729     return performClassCombine(N, DCI);
14730   case ISD::FCANONICALIZE:
14731     return performFCanonicalizeCombine(N, DCI);
14732   case AMDGPUISD::RCP:
14733     return performRcpCombine(N, DCI);
14734   case ISD::FLDEXP:
14735   case AMDGPUISD::FRACT:
14736   case AMDGPUISD::RSQ:
14737   case AMDGPUISD::RCP_LEGACY:
14738   case AMDGPUISD::RCP_IFLAG:
14739   case AMDGPUISD::RSQ_CLAMP: {
14740     // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
14741     SDValue Src = N->getOperand(0);
14742     if (Src.isUndef())
14743       return Src;
14744     break;
14745   }
14746   case ISD::SINT_TO_FP:
14747   case ISD::UINT_TO_FP:
14748     return performUCharToFloatCombine(N, DCI);
14749   case ISD::FCOPYSIGN:
14750     return performFCopySignCombine(N, DCI);
14751   case AMDGPUISD::CVT_F32_UBYTE0:
14752   case AMDGPUISD::CVT_F32_UBYTE1:
14753   case AMDGPUISD::CVT_F32_UBYTE2:
14754   case AMDGPUISD::CVT_F32_UBYTE3:
14755     return performCvtF32UByteNCombine(N, DCI);
14756   case AMDGPUISD::FMED3:
14757     return performFMed3Combine(N, DCI);
14758   case AMDGPUISD::CVT_PKRTZ_F16_F32:
14759     return performCvtPkRTZCombine(N, DCI);
14760   case AMDGPUISD::CLAMP:
14761     return performClampCombine(N, DCI);
14762   case ISD::SCALAR_TO_VECTOR: {
14763     SelectionDAG &DAG = DCI.DAG;
14764     EVT VT = N->getValueType(0);
14765 
14766     // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
14767     if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
14768       SDLoc SL(N);
14769       SDValue Src = N->getOperand(0);
14770       EVT EltVT = Src.getValueType();
14771       if (EltVT != MVT::i16)
14772         Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
14773 
14774       SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
14775       return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
14776     }
14777 
14778     break;
14779   }
14780   case ISD::EXTRACT_VECTOR_ELT:
14781     return performExtractVectorEltCombine(N, DCI);
14782   case ISD::INSERT_VECTOR_ELT:
14783     return performInsertVectorEltCombine(N, DCI);
14784   case ISD::FP_ROUND:
14785     return performFPRoundCombine(N, DCI);
14786   case ISD::LOAD: {
14787     if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
14788       return Widened;
14789     [[fallthrough]];
14790   }
14791   default: {
14792     if (!DCI.isBeforeLegalize()) {
14793       if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
14794         return performMemSDNodeCombine(MemNode, DCI);
14795     }
14796 
14797     break;
14798   }
14799   }
14800 
14801   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
14802 }
14803 
14804 /// Helper function for adjustWritemask
14805 static unsigned SubIdx2Lane(unsigned Idx) {
14806   switch (Idx) {
14807   default: return ~0u;
14808   case AMDGPU::sub0: return 0;
14809   case AMDGPU::sub1: return 1;
14810   case AMDGPU::sub2: return 2;
14811   case AMDGPU::sub3: return 3;
14812   case AMDGPU::sub4: return 4; // Possible with TFE/LWE
14813   }
14814 }
14815 
14816 /// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
14817 SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
14818                                           SelectionDAG &DAG) const {
14819   unsigned Opcode = Node->getMachineOpcode();
14820 
14821   // Subtract 1 because the vdata output is not a MachineSDNode operand.
14822   int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
14823   if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
14824     return Node; // not implemented for D16
14825 
14826   SDNode *Users[5] = { nullptr };
14827   unsigned Lane = 0;
14828   unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
14829   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
14830   unsigned NewDmask = 0;
14831   unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
14832   unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
14833   bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
14834                   (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
14835                      ? true
14836                      : false;
14837   unsigned TFCLane = 0;
14838   bool HasChain = Node->getNumValues() > 1;
14839 
14840   if (OldDmask == 0) {
14841     // These are folded out, but on the chance it happens don't assert.
14842     return Node;
14843   }
14844 
14845   unsigned OldBitsSet = llvm::popcount(OldDmask);
14846   // Work out which is the TFE/LWE lane if that is enabled.
14847   if (UsesTFC) {
14848     TFCLane = OldBitsSet;
14849   }
14850 
14851   // Try to figure out the used register components
14852   for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
14853        I != E; ++I) {
14854 
14855     // Don't look at users of the chain.
14856     if (I.getUse().getResNo() != 0)
14857       continue;
14858 
14859     // Abort if we can't understand the usage
14860     if (!I->isMachineOpcode() ||
14861         I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
14862       return Node;
14863 
14864     // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
14865     // Note that subregs are packed, i.e. Lane==0 is the first bit set
14866     // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
14867     // set, etc.
14868     Lane = SubIdx2Lane(I->getConstantOperandVal(1));
14869     if (Lane == ~0u)
14870       return Node;
14871 
14872     // Check if the use is for the TFE/LWE generated result at VGPRn+1.
14873     if (UsesTFC && Lane == TFCLane) {
14874       Users[Lane] = *I;
14875     } else {
14876       // Set which texture component corresponds to the lane.
14877       unsigned Comp;
14878       for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
14879         Comp = llvm::countr_zero(Dmask);
14880         Dmask &= ~(1 << Comp);
14881       }
14882 
14883       // Abort if we have more than one user per component.
14884       if (Users[Lane])
14885         return Node;
14886 
14887       Users[Lane] = *I;
14888       NewDmask |= 1 << Comp;
14889     }
14890   }
14891 
14892   // Don't allow 0 dmask, as hardware assumes one channel enabled.
14893   bool NoChannels = !NewDmask;
14894   if (NoChannels) {
14895     if (!UsesTFC) {
14896       // No uses of the result and not using TFC. Then do nothing.
14897       return Node;
14898     }
14899     // If the original dmask has one channel - then nothing to do
14900     if (OldBitsSet == 1)
14901       return Node;
14902     // Use an arbitrary dmask - required for the instruction to work
14903     NewDmask = 1;
14904   }
14905   // Abort if there's no change
14906   if (NewDmask == OldDmask)
14907     return Node;
14908 
14909   unsigned BitsSet = llvm::popcount(NewDmask);
14910 
14911   // Check for TFE or LWE - increase the number of channels by one to account
14912   // for the extra return value
14913   // This will need adjustment for D16 if this is also included in
14914   // adjustWriteMask (this function) but at present D16 are excluded.
14915   unsigned NewChannels = BitsSet + UsesTFC;
14916 
14917   int NewOpcode =
14918       AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
14919   assert(NewOpcode != -1 &&
14920          NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
14921          "failed to find equivalent MIMG op");
14922 
14923   // Adjust the writemask in the node
14924   SmallVector<SDValue, 12> Ops;
14925   Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
14926   Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
14927   Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
14928 
14929   MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
14930 
14931   MVT ResultVT = NewChannels == 1 ?
14932     SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
14933                            NewChannels == 5 ? 8 : NewChannels);
14934   SDVTList NewVTList = HasChain ?
14935     DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
14936 
14937 
14938   MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
14939                                               NewVTList, Ops);
14940 
14941   if (HasChain) {
14942     // Update chain.
14943     DAG.setNodeMemRefs(NewNode, Node->memoperands());
14944     DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
14945   }
14946 
14947   if (NewChannels == 1) {
14948     assert(Node->hasNUsesOfValue(1, 0));
14949     SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
14950                                       SDLoc(Node), Users[Lane]->getValueType(0),
14951                                       SDValue(NewNode, 0));
14952     DAG.ReplaceAllUsesWith(Users[Lane], Copy);
14953     return nullptr;
14954   }
14955 
14956   // Update the users of the node with the new indices
14957   for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
14958     SDNode *User = Users[i];
14959     if (!User) {
14960       // Handle the special case of NoChannels. We set NewDmask to 1 above, but
14961       // Users[0] is still nullptr because channel 0 doesn't really have a use.
14962       if (i || !NoChannels)
14963         continue;
14964     } else {
14965       SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
14966       SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
14967       if (NewUser != User) {
14968         DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
14969         DAG.RemoveDeadNode(User);
14970       }
14971     }
14972 
14973     switch (Idx) {
14974     default: break;
14975     case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
14976     case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
14977     case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
14978     case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
14979     }
14980   }
14981 
14982   DAG.RemoveDeadNode(Node);
14983   return nullptr;
14984 }
14985 
14986 static bool isFrameIndexOp(SDValue Op) {
14987   if (Op.getOpcode() == ISD::AssertZext)
14988     Op = Op.getOperand(0);
14989 
14990   return isa<FrameIndexSDNode>(Op);
14991 }
14992 
14993 /// Legalize target independent instructions (e.g. INSERT_SUBREG)
14994 /// with frame index operands.
14995 /// LLVM assumes that inputs are to these instructions are registers.
14996 SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
14997                                                         SelectionDAG &DAG) const {
14998   if (Node->getOpcode() == ISD::CopyToReg) {
14999     RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15000     SDValue SrcVal = Node->getOperand(2);
15001 
15002     // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
15003     // to try understanding copies to physical registers.
15004     if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15005       SDLoc SL(Node);
15006       MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15007       SDValue VReg = DAG.getRegister(
15008         MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15009 
15010       SDNode *Glued = Node->getGluedNode();
15011       SDValue ToVReg
15012         = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
15013                          SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15014       SDValue ToResultReg
15015         = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
15016                            VReg, ToVReg.getValue(1));
15017       DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
15018       DAG.RemoveDeadNode(Node);
15019       return ToResultReg.getNode();
15020     }
15021   }
15022 
15023   SmallVector<SDValue, 8> Ops;
15024   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15025     if (!isFrameIndexOp(Node->getOperand(i))) {
15026       Ops.push_back(Node->getOperand(i));
15027       continue;
15028     }
15029 
15030     SDLoc DL(Node);
15031     Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
15032                                      Node->getOperand(i).getValueType(),
15033                                      Node->getOperand(i)), 0));
15034   }
15035 
15036   return DAG.UpdateNodeOperands(Node, Ops);
15037 }
15038 
15039 /// Fold the instructions after selecting them.
15040 /// Returns null if users were already updated.
15041 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
15042                                           SelectionDAG &DAG) const {
15043   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15044   unsigned Opcode = Node->getMachineOpcode();
15045 
15046   if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15047       !TII->isGather4(Opcode) &&
15048       AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
15049     return adjustWritemask(Node, DAG);
15050   }
15051 
15052   if (Opcode == AMDGPU::INSERT_SUBREG ||
15053       Opcode == AMDGPU::REG_SEQUENCE) {
15054     legalizeTargetIndependentNode(Node, DAG);
15055     return Node;
15056   }
15057 
15058   switch (Opcode) {
15059   case AMDGPU::V_DIV_SCALE_F32_e64:
15060   case AMDGPU::V_DIV_SCALE_F64_e64: {
15061     // Satisfy the operand register constraint when one of the inputs is
15062     // undefined. Ordinarily each undef value will have its own implicit_def of
15063     // a vreg, so force these to use a single register.
15064     SDValue Src0 = Node->getOperand(1);
15065     SDValue Src1 = Node->getOperand(3);
15066     SDValue Src2 = Node->getOperand(5);
15067 
15068     if ((Src0.isMachineOpcode() &&
15069          Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15070         (Src0 == Src1 || Src0 == Src2))
15071       break;
15072 
15073     MVT VT = Src0.getValueType().getSimpleVT();
15074     const TargetRegisterClass *RC =
15075         getRegClassFor(VT, Src0.getNode()->isDivergent());
15076 
15077     MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15078     SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
15079 
15080     SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
15081                                       UndefReg, Src0, SDValue());
15082 
15083     // src0 must be the same register as src1 or src2, even if the value is
15084     // undefined, so make sure we don't violate this constraint.
15085     if (Src0.isMachineOpcode() &&
15086         Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15087       if (Src1.isMachineOpcode() &&
15088           Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15089         Src0 = Src1;
15090       else if (Src2.isMachineOpcode() &&
15091                Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15092         Src0 = Src2;
15093       else {
15094         assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15095         Src0 = UndefReg;
15096         Src1 = UndefReg;
15097       }
15098     } else
15099       break;
15100 
15101     SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
15102     Ops[1] = Src0;
15103     Ops[3] = Src1;
15104     Ops[5] = Src2;
15105     Ops.push_back(ImpDef.getValue(1));
15106     return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15107   }
15108   default:
15109     break;
15110   }
15111 
15112   return Node;
15113 }
15114 
15115 // Any MIMG instructions that use tfe or lwe require an initialization of the
15116 // result register that will be written in the case of a memory access failure.
15117 // The required code is also added to tie this init code to the result of the
15118 // img instruction.
15119 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
15120   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15121   const SIRegisterInfo &TRI = TII->getRegisterInfo();
15122   MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15123   MachineBasicBlock &MBB = *MI.getParent();
15124 
15125   int DstIdx =
15126       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15127   unsigned InitIdx = 0;
15128 
15129   if (TII->isImage(MI)) {
15130     MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15131     MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15132     MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15133 
15134     if (!TFE && !LWE) // intersect_ray
15135       return;
15136 
15137     unsigned TFEVal = TFE ? TFE->getImm() : 0;
15138     unsigned LWEVal = LWE ? LWE->getImm() : 0;
15139     unsigned D16Val = D16 ? D16->getImm() : 0;
15140 
15141     if (!TFEVal && !LWEVal)
15142       return;
15143 
15144     // At least one of TFE or LWE are non-zero
15145     // We have to insert a suitable initialization of the result value and
15146     // tie this to the dest of the image instruction.
15147 
15148     // Calculate which dword we have to initialize to 0.
15149     MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15150 
15151     // check that dmask operand is found.
15152     assert(MO_Dmask && "Expected dmask operand in instruction");
15153 
15154     unsigned dmask = MO_Dmask->getImm();
15155     // Determine the number of active lanes taking into account the
15156     // Gather4 special case
15157     unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15158 
15159     bool Packed = !Subtarget->hasUnpackedD16VMem();
15160 
15161     InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15162 
15163     // Abandon attempt if the dst size isn't large enough
15164     // - this is in fact an error but this is picked up elsewhere and
15165     // reported correctly.
15166     uint32_t DstSize =
15167         TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15168     if (DstSize < InitIdx)
15169       return;
15170   } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15171     InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15172   } else {
15173     return;
15174   }
15175 
15176   const DebugLoc &DL = MI.getDebugLoc();
15177 
15178   // Create a register for the initialization value.
15179   Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15180   unsigned NewDst = 0; // Final initialized value will be in here
15181 
15182   // If PRTStrictNull feature is enabled (the default) then initialize
15183   // all the result registers to 0, otherwise just the error indication
15184   // register (VGPRn+1)
15185   unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15186   unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15187 
15188   BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15189   for (; SizeLeft; SizeLeft--, CurrIdx++) {
15190     NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15191     // Initialize dword
15192     Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15193     BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15194       .addImm(0);
15195     // Insert into the super-reg
15196     BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15197       .addReg(PrevDst)
15198       .addReg(SubReg)
15199       .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
15200 
15201     PrevDst = NewDst;
15202   }
15203 
15204   // Add as an implicit operand
15205   MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15206 
15207   // Tie the just added implicit operand to the dst
15208   MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15209 }
15210 
15211 /// Assign the register class depending on the number of
15212 /// bits set in the writemask
15213 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
15214                                                      SDNode *Node) const {
15215   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15216 
15217   MachineFunction *MF = MI.getParent()->getParent();
15218   MachineRegisterInfo &MRI = MF->getRegInfo();
15219   SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
15220 
15221   if (TII->isVOP3(MI.getOpcode())) {
15222     // Make sure constant bus requirements are respected.
15223     TII->legalizeOperandsVOP3(MRI, MI);
15224 
15225     // Prefer VGPRs over AGPRs in mAI instructions where possible.
15226     // This saves a chain-copy of registers and better balance register
15227     // use between vgpr and agpr as agpr tuples tend to be big.
15228     if (!MI.getDesc().operands().empty()) {
15229       unsigned Opc = MI.getOpcode();
15230       bool HasAGPRs = Info->mayNeedAGPRs();
15231       const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15232       int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15233       for (auto I :
15234            {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15235             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15236         if (I == -1)
15237           break;
15238         if ((I == Src2Idx) && (HasAGPRs))
15239           break;
15240         MachineOperand &Op = MI.getOperand(I);
15241         if (!Op.isReg() || !Op.getReg().isVirtual())
15242           continue;
15243         auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15244         if (!TRI->hasAGPRs(RC))
15245           continue;
15246         auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15247         if (!Src || !Src->isCopy() ||
15248             !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15249           continue;
15250         auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15251         // All uses of agpr64 and agpr32 can also accept vgpr except for
15252         // v_accvgpr_read, but we do not produce agpr reads during selection,
15253         // so no use checks are needed.
15254         MRI.setRegClass(Op.getReg(), NewRC);
15255       }
15256 
15257       if (!HasAGPRs)
15258         return;
15259 
15260       // Resolve the rest of AV operands to AGPRs.
15261       if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15262         if (Src2->isReg() && Src2->getReg().isVirtual()) {
15263           auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15264           if (TRI->isVectorSuperClass(RC)) {
15265             auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15266             MRI.setRegClass(Src2->getReg(), NewRC);
15267             if (Src2->isTied())
15268               MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15269           }
15270         }
15271       }
15272     }
15273 
15274     return;
15275   }
15276 
15277   if (TII->isImage(MI))
15278     TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15279 }
15280 
15281 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
15282                               uint64_t Val) {
15283   SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15284   return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15285 }
15286 
15287 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
15288                                                 const SDLoc &DL,
15289                                                 SDValue Ptr) const {
15290   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15291 
15292   // Build the half of the subregister with the constants before building the
15293   // full 128-bit register. If we are building multiple resource descriptors,
15294   // this will allow CSEing of the 2-component register.
15295   const SDValue Ops0[] = {
15296     DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15297     buildSMovImm32(DAG, DL, 0),
15298     DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15299     buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15300     DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
15301   };
15302 
15303   SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
15304                                                 MVT::v2i32, Ops0), 0);
15305 
15306   // Combine the constants and the pointer.
15307   const SDValue Ops1[] = {
15308     DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15309     Ptr,
15310     DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
15311     SubRegHi,
15312     DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
15313   };
15314 
15315   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15316 }
15317 
15318 /// Return a resource descriptor with the 'Add TID' bit enabled
15319 ///        The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15320 ///        of the resource descriptor) to create an offset, which is added to
15321 ///        the resource pointer.
15322 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
15323                                            SDValue Ptr, uint32_t RsrcDword1,
15324                                            uint64_t RsrcDword2And3) const {
15325   SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15326   SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15327   if (RsrcDword1) {
15328     PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15329                                      DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15330                     0);
15331   }
15332 
15333   SDValue DataLo = buildSMovImm32(DAG, DL,
15334                                   RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15335   SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15336 
15337   const SDValue Ops[] = {
15338     DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15339     PtrLo,
15340     DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15341     PtrHi,
15342     DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15343     DataLo,
15344     DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15345     DataHi,
15346     DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
15347   };
15348 
15349   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15350 }
15351 
15352 //===----------------------------------------------------------------------===//
15353 //                         SI Inline Assembly Support
15354 //===----------------------------------------------------------------------===//
15355 
15356 std::pair<unsigned, const TargetRegisterClass *>
15357 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
15358                                                StringRef Constraint,
15359                                                MVT VT) const {
15360   const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15361 
15362   const TargetRegisterClass *RC = nullptr;
15363   if (Constraint.size() == 1) {
15364     const unsigned BitWidth = VT.getSizeInBits();
15365     switch (Constraint[0]) {
15366     default:
15367       return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15368     case 's':
15369     case 'r':
15370       switch (BitWidth) {
15371       case 16:
15372         RC = &AMDGPU::SReg_32RegClass;
15373         break;
15374       case 64:
15375         RC = &AMDGPU::SGPR_64RegClass;
15376         break;
15377       default:
15378         RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
15379         if (!RC)
15380           return std::pair(0U, nullptr);
15381         break;
15382       }
15383       break;
15384     case 'v':
15385       switch (BitWidth) {
15386       case 16:
15387         RC = &AMDGPU::VGPR_32RegClass;
15388         break;
15389       default:
15390         RC = TRI->getVGPRClassForBitWidth(BitWidth);
15391         if (!RC)
15392           return std::pair(0U, nullptr);
15393         break;
15394       }
15395       break;
15396     case 'a':
15397       if (!Subtarget->hasMAIInsts())
15398         break;
15399       switch (BitWidth) {
15400       case 16:
15401         RC = &AMDGPU::AGPR_32RegClass;
15402         break;
15403       default:
15404         RC = TRI->getAGPRClassForBitWidth(BitWidth);
15405         if (!RC)
15406           return std::pair(0U, nullptr);
15407         break;
15408       }
15409       break;
15410     }
15411     // We actually support i128, i16 and f16 as inline parameters
15412     // even if they are not reported as legal
15413     if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15414                VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15415       return std::pair(0U, RC);
15416   }
15417 
15418   if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15419     StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15420     if (RegName.consume_front("v")) {
15421       RC = &AMDGPU::VGPR_32RegClass;
15422     } else if (RegName.consume_front("s")) {
15423       RC = &AMDGPU::SGPR_32RegClass;
15424     } else if (RegName.consume_front("a")) {
15425       RC = &AMDGPU::AGPR_32RegClass;
15426     }
15427 
15428     if (RC) {
15429       uint32_t Idx;
15430       if (RegName.consume_front("[")) {
15431         uint32_t End;
15432         bool Failed = RegName.consumeInteger(10, Idx);
15433         Failed |= !RegName.consume_front(":");
15434         Failed |= RegName.consumeInteger(10, End);
15435         Failed |= !RegName.consume_back("]");
15436         if (!Failed) {
15437           uint32_t Width = (End - Idx + 1) * 32;
15438           MCRegister Reg = RC->getRegister(Idx);
15439           if (SIRegisterInfo::isVGPRClass(RC))
15440             RC = TRI->getVGPRClassForBitWidth(Width);
15441           else if (SIRegisterInfo::isSGPRClass(RC))
15442             RC = TRI->getSGPRClassForBitWidth(Width);
15443           else if (SIRegisterInfo::isAGPRClass(RC))
15444             RC = TRI->getAGPRClassForBitWidth(Width);
15445           if (RC) {
15446             Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15447             return std::pair(Reg, RC);
15448           }
15449         }
15450       } else {
15451         bool Failed = RegName.getAsInteger(10, Idx);
15452         if (!Failed && Idx < RC->getNumRegs())
15453           return std::pair(RC->getRegister(Idx), RC);
15454       }
15455     }
15456   }
15457 
15458   auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15459   if (Ret.first)
15460     Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15461 
15462   return Ret;
15463 }
15464 
15465 static bool isImmConstraint(StringRef Constraint) {
15466   if (Constraint.size() == 1) {
15467     switch (Constraint[0]) {
15468     default: break;
15469     case 'I':
15470     case 'J':
15471     case 'A':
15472     case 'B':
15473     case 'C':
15474       return true;
15475     }
15476   } else if (Constraint == "DA" ||
15477              Constraint == "DB") {
15478     return true;
15479   }
15480   return false;
15481 }
15482 
15483 SITargetLowering::ConstraintType
15484 SITargetLowering::getConstraintType(StringRef Constraint) const {
15485   if (Constraint.size() == 1) {
15486     switch (Constraint[0]) {
15487     default: break;
15488     case 's':
15489     case 'v':
15490     case 'a':
15491       return C_RegisterClass;
15492     }
15493   }
15494   if (isImmConstraint(Constraint)) {
15495     return C_Other;
15496   }
15497   return TargetLowering::getConstraintType(Constraint);
15498 }
15499 
15500 static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15501   if (!AMDGPU::isInlinableIntLiteral(Val)) {
15502     Val = Val & maskTrailingOnes<uint64_t>(Size);
15503   }
15504   return Val;
15505 }
15506 
15507 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
15508                                                     StringRef Constraint,
15509                                                     std::vector<SDValue> &Ops,
15510                                                     SelectionDAG &DAG) const {
15511   if (isImmConstraint(Constraint)) {
15512     uint64_t Val;
15513     if (getAsmOperandConstVal(Op, Val) &&
15514         checkAsmConstraintVal(Op, Constraint, Val)) {
15515       Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
15516       Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15517     }
15518   } else {
15519     TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15520   }
15521 }
15522 
15523 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
15524   unsigned Size = Op.getScalarValueSizeInBits();
15525   if (Size > 64)
15526     return false;
15527 
15528   if (Size == 16 && !Subtarget->has16BitInsts())
15529     return false;
15530 
15531   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15532     Val = C->getSExtValue();
15533     return true;
15534   }
15535   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
15536     Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15537     return true;
15538   }
15539   if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
15540     if (Size != 16 || Op.getNumOperands() != 2)
15541       return false;
15542     if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15543       return false;
15544     if (ConstantSDNode *C = V->getConstantSplatNode()) {
15545       Val = C->getSExtValue();
15546       return true;
15547     }
15548     if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15549       Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15550       return true;
15551     }
15552   }
15553 
15554   return false;
15555 }
15556 
15557 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
15558                                              uint64_t Val) const {
15559   if (Constraint.size() == 1) {
15560     switch (Constraint[0]) {
15561     case 'I':
15562       return AMDGPU::isInlinableIntLiteral(Val);
15563     case 'J':
15564       return isInt<16>(Val);
15565     case 'A':
15566       return checkAsmConstraintValA(Op, Val);
15567     case 'B':
15568       return isInt<32>(Val);
15569     case 'C':
15570       return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
15571              AMDGPU::isInlinableIntLiteral(Val);
15572     default:
15573       break;
15574     }
15575   } else if (Constraint.size() == 2) {
15576     if (Constraint == "DA") {
15577       int64_t HiBits = static_cast<int32_t>(Val >> 32);
15578       int64_t LoBits = static_cast<int32_t>(Val);
15579       return checkAsmConstraintValA(Op, HiBits, 32) &&
15580              checkAsmConstraintValA(Op, LoBits, 32);
15581     }
15582     if (Constraint == "DB") {
15583       return true;
15584     }
15585   }
15586   llvm_unreachable("Invalid asm constraint");
15587 }
15588 
15589 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
15590                                               unsigned MaxSize) const {
15591   unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15592   bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15593   if (Size == 16) {
15594     MVT VT = Op.getSimpleValueType();
15595     switch (VT.SimpleTy) {
15596     default:
15597       return false;
15598     case MVT::i16:
15599       return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
15600     case MVT::f16:
15601       return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
15602     case MVT::bf16:
15603       return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
15604     case MVT::v2i16:
15605       return AMDGPU::getInlineEncodingV2I16(Val).has_value();
15606     case MVT::v2f16:
15607       return AMDGPU::getInlineEncodingV2F16(Val).has_value();
15608     case MVT::v2bf16:
15609       return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
15610     }
15611   }
15612   if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
15613       (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
15614     return true;
15615   return false;
15616 }
15617 
15618 static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
15619   switch (UnalignedClassID) {
15620   case AMDGPU::VReg_64RegClassID:
15621     return AMDGPU::VReg_64_Align2RegClassID;
15622   case AMDGPU::VReg_96RegClassID:
15623     return AMDGPU::VReg_96_Align2RegClassID;
15624   case AMDGPU::VReg_128RegClassID:
15625     return AMDGPU::VReg_128_Align2RegClassID;
15626   case AMDGPU::VReg_160RegClassID:
15627     return AMDGPU::VReg_160_Align2RegClassID;
15628   case AMDGPU::VReg_192RegClassID:
15629     return AMDGPU::VReg_192_Align2RegClassID;
15630   case AMDGPU::VReg_224RegClassID:
15631     return AMDGPU::VReg_224_Align2RegClassID;
15632   case AMDGPU::VReg_256RegClassID:
15633     return AMDGPU::VReg_256_Align2RegClassID;
15634   case AMDGPU::VReg_288RegClassID:
15635     return AMDGPU::VReg_288_Align2RegClassID;
15636   case AMDGPU::VReg_320RegClassID:
15637     return AMDGPU::VReg_320_Align2RegClassID;
15638   case AMDGPU::VReg_352RegClassID:
15639     return AMDGPU::VReg_352_Align2RegClassID;
15640   case AMDGPU::VReg_384RegClassID:
15641     return AMDGPU::VReg_384_Align2RegClassID;
15642   case AMDGPU::VReg_512RegClassID:
15643     return AMDGPU::VReg_512_Align2RegClassID;
15644   case AMDGPU::VReg_1024RegClassID:
15645     return AMDGPU::VReg_1024_Align2RegClassID;
15646   case AMDGPU::AReg_64RegClassID:
15647     return AMDGPU::AReg_64_Align2RegClassID;
15648   case AMDGPU::AReg_96RegClassID:
15649     return AMDGPU::AReg_96_Align2RegClassID;
15650   case AMDGPU::AReg_128RegClassID:
15651     return AMDGPU::AReg_128_Align2RegClassID;
15652   case AMDGPU::AReg_160RegClassID:
15653     return AMDGPU::AReg_160_Align2RegClassID;
15654   case AMDGPU::AReg_192RegClassID:
15655     return AMDGPU::AReg_192_Align2RegClassID;
15656   case AMDGPU::AReg_256RegClassID:
15657     return AMDGPU::AReg_256_Align2RegClassID;
15658   case AMDGPU::AReg_512RegClassID:
15659     return AMDGPU::AReg_512_Align2RegClassID;
15660   case AMDGPU::AReg_1024RegClassID:
15661     return AMDGPU::AReg_1024_Align2RegClassID;
15662   default:
15663     return -1;
15664   }
15665 }
15666 
15667 // Figure out which registers should be reserved for stack access. Only after
15668 // the function is legalized do we know all of the non-spill stack objects or if
15669 // calls are present.
15670 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
15671   MachineRegisterInfo &MRI = MF.getRegInfo();
15672   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15673   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
15674   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15675   const SIInstrInfo *TII = ST.getInstrInfo();
15676 
15677   if (Info->isEntryFunction()) {
15678     // Callable functions have fixed registers used for stack access.
15679     reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
15680   }
15681 
15682   // TODO: Move this logic to getReservedRegs()
15683   // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
15684   unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
15685   Register SReg = ST.isWave32()
15686                       ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
15687                       : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
15688                                                      &AMDGPU::SGPR_64RegClass);
15689   Info->setSGPRForEXECCopy(SReg);
15690 
15691   assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
15692                              Info->getStackPtrOffsetReg()));
15693   if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15694     MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
15695 
15696   // We need to worry about replacing the default register with itself in case
15697   // of MIR testcases missing the MFI.
15698   if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15699     MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
15700 
15701   if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15702     MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
15703 
15704   Info->limitOccupancy(MF);
15705 
15706   if (ST.isWave32() && !MF.empty()) {
15707     for (auto &MBB : MF) {
15708       for (auto &MI : MBB) {
15709         TII->fixImplicitOperands(MI);
15710       }
15711     }
15712   }
15713 
15714   // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
15715   // classes if required. Ideally the register class constraints would differ
15716   // per-subtarget, but there's no easy way to achieve that right now. This is
15717   // not a problem for VGPRs because the correctly aligned VGPR class is implied
15718   // from using them as the register class for legal types.
15719   if (ST.needsAlignedVGPRs()) {
15720     for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
15721       const Register Reg = Register::index2VirtReg(I);
15722       const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
15723       if (!RC)
15724         continue;
15725       int NewClassID = getAlignedAGPRClassID(RC->getID());
15726       if (NewClassID != -1)
15727         MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
15728     }
15729   }
15730 
15731   TargetLoweringBase::finalizeLowering(MF);
15732 }
15733 
15734 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
15735                                                      KnownBits &Known,
15736                                                      const APInt &DemandedElts,
15737                                                      const SelectionDAG &DAG,
15738                                                      unsigned Depth) const {
15739   Known.resetAll();
15740   unsigned Opc = Op.getOpcode();
15741   switch (Opc) {
15742   case ISD::INTRINSIC_WO_CHAIN: {
15743     unsigned IID = Op.getConstantOperandVal(0);
15744     switch (IID) {
15745     case Intrinsic::amdgcn_mbcnt_lo:
15746     case Intrinsic::amdgcn_mbcnt_hi: {
15747       const GCNSubtarget &ST =
15748           DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
15749       // These return at most the (wavefront size - 1) + src1
15750       // As long as src1 is an immediate we can calc known bits
15751       KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
15752       unsigned Src1ValBits = Src1Known.countMaxActiveBits();
15753       unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
15754       // Cater for potential carry
15755       MaxActiveBits += Src1ValBits ? 1 : 0;
15756       unsigned Size = Op.getValueType().getSizeInBits();
15757       if (MaxActiveBits < Size)
15758         Known.Zero.setHighBits(Size - MaxActiveBits);
15759       return;
15760     }
15761     }
15762     break;
15763   }
15764   }
15765   return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
15766       Op, Known, DemandedElts, DAG, Depth);
15767 }
15768 
15769 void SITargetLowering::computeKnownBitsForFrameIndex(
15770   const int FI, KnownBits &Known, const MachineFunction &MF) const {
15771   TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
15772 
15773   // Set the high bits to zero based on the maximum allowed scratch size per
15774   // wave. We can't use vaddr in MUBUF instructions if we don't know the address
15775   // calculation won't overflow, so assume the sign bit is never set.
15776   Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
15777 }
15778 
15779 static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
15780                                    KnownBits &Known, unsigned Dim) {
15781   unsigned MaxValue =
15782       ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
15783   Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
15784 }
15785 
15786 void SITargetLowering::computeKnownBitsForTargetInstr(
15787     GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
15788     const MachineRegisterInfo &MRI, unsigned Depth) const {
15789   const MachineInstr *MI = MRI.getVRegDef(R);
15790   switch (MI->getOpcode()) {
15791   case AMDGPU::G_INTRINSIC:
15792   case AMDGPU::G_INTRINSIC_CONVERGENT: {
15793     switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15794     case Intrinsic::amdgcn_workitem_id_x:
15795       knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
15796       break;
15797     case Intrinsic::amdgcn_workitem_id_y:
15798       knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
15799       break;
15800     case Intrinsic::amdgcn_workitem_id_z:
15801       knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
15802       break;
15803     case Intrinsic::amdgcn_mbcnt_lo:
15804     case Intrinsic::amdgcn_mbcnt_hi: {
15805       // These return at most the wavefront size - 1.
15806       unsigned Size = MRI.getType(R).getSizeInBits();
15807       Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
15808       break;
15809     }
15810     case Intrinsic::amdgcn_groupstaticsize: {
15811       // We can report everything over the maximum size as 0. We can't report
15812       // based on the actual size because we don't know if it's accurate or not
15813       // at any given point.
15814       Known.Zero.setHighBits(
15815           llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
15816       break;
15817     }
15818     }
15819     break;
15820   }
15821   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
15822     Known.Zero.setHighBits(24);
15823     break;
15824   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
15825     Known.Zero.setHighBits(16);
15826     break;
15827   case AMDGPU::G_AMDGPU_SMED3:
15828   case AMDGPU::G_AMDGPU_UMED3: {
15829     auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
15830 
15831     KnownBits Known2;
15832     KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
15833     if (Known2.isUnknown())
15834       break;
15835 
15836     KnownBits Known1;
15837     KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
15838     if (Known1.isUnknown())
15839       break;
15840 
15841     KnownBits Known0;
15842     KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
15843     if (Known0.isUnknown())
15844       break;
15845 
15846     // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
15847     Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
15848     Known.One = Known0.One & Known1.One & Known2.One;
15849     break;
15850   }
15851   }
15852 }
15853 
15854 Align SITargetLowering::computeKnownAlignForTargetInstr(
15855   GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
15856   unsigned Depth) const {
15857   const MachineInstr *MI = MRI.getVRegDef(R);
15858   if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
15859     // FIXME: Can this move to generic code? What about the case where the call
15860     // site specifies a lower alignment?
15861     Intrinsic::ID IID = GI->getIntrinsicID();
15862     LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
15863     AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
15864     if (MaybeAlign RetAlign = Attrs.getRetAlignment())
15865       return *RetAlign;
15866   }
15867   return Align(1);
15868 }
15869 
15870 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
15871   const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
15872   const Align CacheLineAlign = Align(64);
15873 
15874   // Pre-GFX10 target did not benefit from loop alignment
15875   if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
15876       getSubtarget()->hasInstFwdPrefetchBug())
15877     return PrefAlign;
15878 
15879   // On GFX10 I$ is 4 x 64 bytes cache lines.
15880   // By default prefetcher keeps one cache line behind and reads two ahead.
15881   // We can modify it with S_INST_PREFETCH for larger loops to have two lines
15882   // behind and one ahead.
15883   // Therefor we can benefit from aligning loop headers if loop fits 192 bytes.
15884   // If loop fits 64 bytes it always spans no more than two cache lines and
15885   // does not need an alignment.
15886   // Else if loop is less or equal 128 bytes we do not need to modify prefetch,
15887   // Else if loop is less or equal 192 bytes we need two lines behind.
15888 
15889   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15890   const MachineBasicBlock *Header = ML->getHeader();
15891   if (Header->getAlignment() != PrefAlign)
15892     return Header->getAlignment(); // Already processed.
15893 
15894   unsigned LoopSize = 0;
15895   for (const MachineBasicBlock *MBB : ML->blocks()) {
15896     // If inner loop block is aligned assume in average half of the alignment
15897     // size to be added as nops.
15898     if (MBB != Header)
15899       LoopSize += MBB->getAlignment().value() / 2;
15900 
15901     for (const MachineInstr &MI : *MBB) {
15902       LoopSize += TII->getInstSizeInBytes(MI);
15903       if (LoopSize > 192)
15904         return PrefAlign;
15905     }
15906   }
15907 
15908   if (LoopSize <= 64)
15909     return PrefAlign;
15910 
15911   if (LoopSize <= 128)
15912     return CacheLineAlign;
15913 
15914   // If any of parent loops is surrounded by prefetch instructions do not
15915   // insert new for inner loop, which would reset parent's settings.
15916   for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
15917     if (MachineBasicBlock *Exit = P->getExitBlock()) {
15918       auto I = Exit->getFirstNonDebugInstr();
15919       if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
15920         return CacheLineAlign;
15921     }
15922   }
15923 
15924   MachineBasicBlock *Pre = ML->getLoopPreheader();
15925   MachineBasicBlock *Exit = ML->getExitBlock();
15926 
15927   if (Pre && Exit) {
15928     auto PreTerm = Pre->getFirstTerminator();
15929     if (PreTerm == Pre->begin() ||
15930         std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
15931       BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15932           .addImm(1); // prefetch 2 lines behind PC
15933 
15934     auto ExitHead = Exit->getFirstNonDebugInstr();
15935     if (ExitHead == Exit->end() ||
15936         ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
15937       BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15938           .addImm(2); // prefetch 1 line behind PC
15939   }
15940 
15941   return CacheLineAlign;
15942 }
15943 
15944 LLVM_ATTRIBUTE_UNUSED
15945 static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
15946   assert(N->getOpcode() == ISD::CopyFromReg);
15947   do {
15948     // Follow the chain until we find an INLINEASM node.
15949     N = N->getOperand(0).getNode();
15950     if (N->getOpcode() == ISD::INLINEASM ||
15951         N->getOpcode() == ISD::INLINEASM_BR)
15952       return true;
15953   } while (N->getOpcode() == ISD::CopyFromReg);
15954   return false;
15955 }
15956 
15957 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
15958                                                   FunctionLoweringInfo *FLI,
15959                                                   UniformityInfo *UA) const {
15960   switch (N->getOpcode()) {
15961   case ISD::CopyFromReg: {
15962     const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
15963     const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
15964     const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15965     Register Reg = R->getReg();
15966 
15967     // FIXME: Why does this need to consider isLiveIn?
15968     if (Reg.isPhysical() || MRI.isLiveIn(Reg))
15969       return !TRI->isSGPRReg(MRI, Reg);
15970 
15971     if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
15972       return UA->isDivergent(V);
15973 
15974     assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
15975     return !TRI->isSGPRReg(MRI, Reg);
15976   }
15977   case ISD::LOAD: {
15978     const LoadSDNode *L = cast<LoadSDNode>(N);
15979     unsigned AS = L->getAddressSpace();
15980     // A flat load may access private memory.
15981     return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
15982   }
15983   case ISD::CALLSEQ_END:
15984     return true;
15985   case ISD::INTRINSIC_WO_CHAIN:
15986     return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
15987   case ISD::INTRINSIC_W_CHAIN:
15988     return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
15989   case AMDGPUISD::ATOMIC_CMP_SWAP:
15990   case AMDGPUISD::BUFFER_ATOMIC_SWAP:
15991   case AMDGPUISD::BUFFER_ATOMIC_ADD:
15992   case AMDGPUISD::BUFFER_ATOMIC_SUB:
15993   case AMDGPUISD::BUFFER_ATOMIC_SMIN:
15994   case AMDGPUISD::BUFFER_ATOMIC_UMIN:
15995   case AMDGPUISD::BUFFER_ATOMIC_SMAX:
15996   case AMDGPUISD::BUFFER_ATOMIC_UMAX:
15997   case AMDGPUISD::BUFFER_ATOMIC_AND:
15998   case AMDGPUISD::BUFFER_ATOMIC_OR:
15999   case AMDGPUISD::BUFFER_ATOMIC_XOR:
16000   case AMDGPUISD::BUFFER_ATOMIC_INC:
16001   case AMDGPUISD::BUFFER_ATOMIC_DEC:
16002   case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
16003   case AMDGPUISD::BUFFER_ATOMIC_CSUB:
16004   case AMDGPUISD::BUFFER_ATOMIC_FADD:
16005   case AMDGPUISD::BUFFER_ATOMIC_FMIN:
16006   case AMDGPUISD::BUFFER_ATOMIC_FMAX:
16007     // Target-specific read-modify-write atomics are sources of divergence.
16008     return true;
16009   default:
16010     if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16011       // Generic read-modify-write atomics are sources of divergence.
16012       return A->readMem() && A->writeMem();
16013     }
16014     return false;
16015   }
16016 }
16017 
16018 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
16019                                                EVT VT) const {
16020   switch (VT.getScalarType().getSimpleVT().SimpleTy) {
16021   case MVT::f32:
16022     return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
16023   case MVT::f64:
16024   case MVT::f16:
16025     return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
16026   default:
16027     return false;
16028   }
16029 }
16030 
16031 bool SITargetLowering::denormalsEnabledForType(
16032     LLT Ty, const MachineFunction &MF) const {
16033   switch (Ty.getScalarSizeInBits()) {
16034   case 32:
16035     return !denormalModeIsFlushAllF32(MF);
16036   case 64:
16037   case 16:
16038     return !denormalModeIsFlushAllF64F16(MF);
16039   default:
16040     return false;
16041   }
16042 }
16043 
16044 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
16045                                                     const SelectionDAG &DAG,
16046                                                     bool SNaN,
16047                                                     unsigned Depth) const {
16048   if (Op.getOpcode() == AMDGPUISD::CLAMP) {
16049     const MachineFunction &MF = DAG.getMachineFunction();
16050     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16051 
16052     if (Info->getMode().DX10Clamp)
16053       return true; // Clamped to 0.
16054     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
16055   }
16056 
16057   return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
16058                                                             SNaN, Depth);
16059 }
16060 
16061 #if 0
16062 // FIXME: This should be checked before unsafe fp atomics are enabled
16063 // Global FP atomic instructions have a hardcoded FP mode and do not support
16064 // FP32 denormals, and only support v2f16 denormals.
16065 static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16066   const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16067   auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16068   if (&Flt == &APFloat::IEEEsingle())
16069     return DenormMode == DenormalMode::getPreserveSign();
16070   return DenormMode == DenormalMode::getIEEE();
16071 }
16072 #endif
16073 
16074 // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16075 // floating point atomic instructions. May generate more efficient code,
16076 // but may not respect rounding and denormal modes, and may give incorrect
16077 // results for certain memory destinations.
16078 bool unsafeFPAtomicsDisabled(Function *F) {
16079   return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16080          "true";
16081 }
16082 
16083 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
16084   LLVMContext &Ctx = RMW->getContext();
16085   SmallVector<StringRef> SSNs;
16086   Ctx.getSyncScopeNames(SSNs);
16087   StringRef MemScope = SSNs[RMW->getSyncScopeID()].empty()
16088                            ? "system"
16089                            : SSNs[RMW->getSyncScopeID()];
16090 
16091   return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16092          << "Hardware instruction generated for atomic "
16093          << RMW->getOperationName(RMW->getOperation())
16094          << " operation at memory scope " << MemScope;
16095 }
16096 
16097 static bool isHalf2OrBFloat2(Type *Ty) {
16098   if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16099     Type *EltTy = VT->getElementType();
16100     return VT->getNumElements() == 2 &&
16101            (EltTy->isHalfTy() || EltTy->isBFloatTy());
16102   }
16103 
16104   return false;
16105 }
16106 
16107 static bool isHalf2(Type *Ty) {
16108   FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16109   return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
16110 }
16111 
16112 static bool isBFloat2(Type *Ty) {
16113   FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16114   return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
16115 }
16116 
16117 TargetLowering::AtomicExpansionKind
16118 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16119   unsigned AS = RMW->getPointerAddressSpace();
16120   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16121     return AtomicExpansionKind::NotAtomic;
16122 
16123   auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16124     OptimizationRemarkEmitter ORE(RMW->getFunction());
16125     ORE.emit([=]() {
16126       return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16127     });
16128     return Kind;
16129   };
16130 
16131   auto SSID = RMW->getSyncScopeID();
16132   bool HasSystemScope =
16133       SSID == SyncScope::System ||
16134       SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16135 
16136   switch (RMW->getOperation()) {
16137   case AtomicRMWInst::Sub:
16138   case AtomicRMWInst::Or:
16139   case AtomicRMWInst::Xor: {
16140     // Atomic sub/or/xor do not work over PCI express, but atomic add
16141     // does. InstCombine transforms these with 0 to or, so undo that.
16142     if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16143       if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16144           ConstVal && ConstVal->isNullValue())
16145         return AtomicExpansionKind::Expand;
16146     }
16147 
16148     break;
16149   }
16150   case AtomicRMWInst::FAdd: {
16151     Type *Ty = RMW->getType();
16152 
16153     // TODO: Handle REGION_ADDRESS
16154     if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16155       // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16156       // is fixed to round-to-nearest-even.
16157       //
16158       // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16159       // round-to-nearest-even.
16160       //
16161       // We ignore the rounding mode problem, even in strictfp. The C++ standard
16162       // suggests it is OK if the floating-point mode may not match the calling
16163       // thread.
16164       if (Ty->isFloatTy()) {
16165         return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
16166                                                  : AtomicExpansionKind::CmpXChg;
16167       }
16168 
16169       if (Ty->isDoubleTy()) {
16170         // Ignores denormal mode, but we don't consider flushing mandatory.
16171         return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
16172                                                  : AtomicExpansionKind::CmpXChg;
16173       }
16174 
16175       if (Subtarget->hasAtomicDsPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16176         return AtomicExpansionKind::None;
16177 
16178       return AtomicExpansionKind::CmpXChg;
16179     }
16180 
16181     if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
16182         AS != AMDGPUAS::BUFFER_FAT_POINTER)
16183       return AtomicExpansionKind::CmpXChg;
16184 
16185     if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16186       return AtomicExpansionKind::None;
16187 
16188     if (AS == AMDGPUAS::FLAT_ADDRESS) {
16189       // gfx940, gfx12
16190       // FIXME: Needs to account for no fine-grained memory
16191       if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16192         return AtomicExpansionKind::None;
16193     } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16194       // gfx90a, gfx940, gfx12
16195       // FIXME: Needs to account for no fine-grained memory
16196       if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16197         return AtomicExpansionKind::None;
16198 
16199       // gfx940, gfx12
16200       // FIXME: Needs to account for no fine-grained memory
16201       if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16202         return AtomicExpansionKind::None;
16203     } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16204       // gfx90a, gfx940, gfx12
16205       // FIXME: Needs to account for no fine-grained memory
16206       if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16207         return AtomicExpansionKind::None;
16208 
16209       // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16210       // buffer. gfx12 does have the buffer version.
16211       if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16212         return AtomicExpansionKind::None;
16213     }
16214 
16215     if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16216       return AtomicExpansionKind::CmpXChg;
16217 
16218     // Always expand system scope fp atomics.
16219     if (HasSystemScope)
16220       return AtomicExpansionKind::CmpXChg;
16221 
16222     // global and flat atomic fadd f64: gfx90a, gfx940.
16223     if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16224       return ReportUnsafeHWInst(AtomicExpansionKind::None);
16225 
16226     if (AS != AMDGPUAS::FLAT_ADDRESS) {
16227       if (Ty->isFloatTy()) {
16228         // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16229         if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16230           return ReportUnsafeHWInst(AtomicExpansionKind::None);
16231         // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16232         if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16233           return ReportUnsafeHWInst(AtomicExpansionKind::None);
16234       } else {
16235         // gfx908
16236         if (RMW->use_empty() &&
16237             Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && isHalf2(Ty))
16238           return ReportUnsafeHWInst(AtomicExpansionKind::None);
16239       }
16240     }
16241 
16242     // flat atomic fadd f32: gfx940, gfx11+.
16243     if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16244       if (Subtarget->hasFlatAtomicFaddF32Inst())
16245         return ReportUnsafeHWInst(AtomicExpansionKind::None);
16246 
16247       // If it is in flat address space, and the type is float, we will try to
16248       // expand it, if the target supports global and lds atomic fadd. The
16249       // reason we need that is, in the expansion, we emit the check of address
16250       // space. If it is in global address space, we emit the global atomic
16251       // fadd; if it is in shared address space, we emit the LDS atomic fadd.
16252       if (Subtarget->hasLDSFPAtomicAddF32()) {
16253         if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16254           return AtomicExpansionKind::Expand;
16255         if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16256           return AtomicExpansionKind::Expand;
16257       }
16258     }
16259 
16260     return AtomicExpansionKind::CmpXChg;
16261   }
16262   case AtomicRMWInst::FMin:
16263   case AtomicRMWInst::FMax: {
16264     Type *Ty = RMW->getType();
16265 
16266     // LDS float and double fmin/fmax were always supported.
16267     if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
16268       return AtomicExpansionKind::None;
16269 
16270     if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16271       return AtomicExpansionKind::CmpXChg;
16272 
16273     // Always expand system scope fp atomics.
16274     if (HasSystemScope)
16275       return AtomicExpansionKind::CmpXChg;
16276 
16277     // For flat and global cases:
16278     // float, double in gfx7. Manual claims denormal support.
16279     // Removed in gfx8.
16280     // float, double restored in gfx10.
16281     // double removed again in gfx11, so only f32 for gfx11/gfx12.
16282     //
16283     // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
16284     // f32.
16285     //
16286     // FIXME: Check scope and fine grained memory
16287     if (AS == AMDGPUAS::FLAT_ADDRESS) {
16288       if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16289         return ReportUnsafeHWInst(AtomicExpansionKind::None);
16290       if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16291         return ReportUnsafeHWInst(AtomicExpansionKind::None);
16292     } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16293                AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16294       if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16295         return ReportUnsafeHWInst(AtomicExpansionKind::None);
16296       if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16297         return ReportUnsafeHWInst(AtomicExpansionKind::None);
16298     }
16299 
16300     return AtomicExpansionKind::CmpXChg;
16301   }
16302   case AtomicRMWInst::Min:
16303   case AtomicRMWInst::Max:
16304   case AtomicRMWInst::UMin:
16305   case AtomicRMWInst::UMax: {
16306     if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
16307         AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16308       // Always expand system scope min/max atomics.
16309       if (HasSystemScope)
16310         return AtomicExpansionKind::CmpXChg;
16311     }
16312     break;
16313   }
16314   default:
16315     break;
16316   }
16317 
16318   return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
16319 }
16320 
16321 TargetLowering::AtomicExpansionKind
16322 SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
16323   return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16324              ? AtomicExpansionKind::NotAtomic
16325              : AtomicExpansionKind::None;
16326 }
16327 
16328 TargetLowering::AtomicExpansionKind
16329 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
16330   return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16331              ? AtomicExpansionKind::NotAtomic
16332              : AtomicExpansionKind::None;
16333 }
16334 
16335 TargetLowering::AtomicExpansionKind
16336 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
16337   return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16338              ? AtomicExpansionKind::NotAtomic
16339              : AtomicExpansionKind::None;
16340 }
16341 
16342 const TargetRegisterClass *
16343 SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16344   const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
16345   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16346   if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16347     return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
16348                                                : &AMDGPU::SReg_32RegClass;
16349   if (!TRI->isSGPRClass(RC) && !isDivergent)
16350     return TRI->getEquivalentSGPRClass(RC);
16351   if (TRI->isSGPRClass(RC) && isDivergent)
16352     return TRI->getEquivalentVGPRClass(RC);
16353 
16354   return RC;
16355 }
16356 
16357 // FIXME: This is a workaround for DivergenceAnalysis not understanding always
16358 // uniform values (as produced by the mask results of control flow intrinsics)
16359 // used outside of divergent blocks. The phi users need to also be treated as
16360 // always uniform.
16361 //
16362 // FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
16363 static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16364                       unsigned WaveSize) {
16365   // FIXME: We assume we never cast the mask results of a control flow
16366   // intrinsic.
16367   // Early exit if the type won't be consistent as a compile time hack.
16368   IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16369   if (!IT || IT->getBitWidth() != WaveSize)
16370     return false;
16371 
16372   if (!isa<Instruction>(V))
16373     return false;
16374   if (!Visited.insert(V).second)
16375     return false;
16376   bool Result = false;
16377   for (const auto *U : V->users()) {
16378     if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16379       if (V == U->getOperand(1)) {
16380         switch (Intrinsic->getIntrinsicID()) {
16381         default:
16382           Result = false;
16383           break;
16384         case Intrinsic::amdgcn_if_break:
16385         case Intrinsic::amdgcn_if:
16386         case Intrinsic::amdgcn_else:
16387           Result = true;
16388           break;
16389         }
16390       }
16391       if (V == U->getOperand(0)) {
16392         switch (Intrinsic->getIntrinsicID()) {
16393         default:
16394           Result = false;
16395           break;
16396         case Intrinsic::amdgcn_end_cf:
16397         case Intrinsic::amdgcn_loop:
16398           Result = true;
16399           break;
16400         }
16401       }
16402     } else {
16403       Result = hasCFUser(U, Visited, WaveSize);
16404     }
16405     if (Result)
16406       break;
16407   }
16408   return Result;
16409 }
16410 
16411 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
16412                                                const Value *V) const {
16413   if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16414     if (CI->isInlineAsm()) {
16415       // FIXME: This cannot give a correct answer. This should only trigger in
16416       // the case where inline asm returns mixed SGPR and VGPR results, used
16417       // outside the defining block. We don't have a specific result to
16418       // consider, so this assumes if any value is SGPR, the overall register
16419       // also needs to be SGPR.
16420       const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16421       TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
16422           MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16423       for (auto &TC : TargetConstraints) {
16424         if (TC.Type == InlineAsm::isOutput) {
16425           ComputeConstraintToUse(TC, SDValue());
16426           const TargetRegisterClass *RC = getRegForInlineAsmConstraint(
16427               SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
16428           if (RC && SIRI->isSGPRClass(RC))
16429             return true;
16430         }
16431       }
16432     }
16433   }
16434   SmallPtrSet<const Value *, 16> Visited;
16435   return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16436 }
16437 
16438 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
16439   SDNode::use_iterator I = N->use_begin(), E = N->use_end();
16440   for (; I != E; ++I) {
16441     if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
16442       if (getBasePtrIndex(M) == I.getOperandNo())
16443         return true;
16444     }
16445   }
16446   return false;
16447 }
16448 
16449 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
16450                                            SDValue N1) const {
16451   if (!N0.hasOneUse())
16452     return false;
16453   // Take care of the opportunity to keep N0 uniform
16454   if (N0->isDivergent() || !N1->isDivergent())
16455     return true;
16456   // Check if we have a good chance to form the memory access pattern with the
16457   // base and offset
16458   return (DAG.isBaseWithConstantOffset(N0) &&
16459           hasMemSDNodeUser(*N0->use_begin()));
16460 }
16461 
16462 bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
16463                                            Register N0, Register N1) const {
16464   return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
16465 }
16466 
16467 MachineMemOperand::Flags
16468 SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
16469   // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
16470   MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
16471   if (I.getMetadata("amdgpu.noclobber"))
16472     Flags |= MONoClobber;
16473   if (I.getMetadata("amdgpu.last.use"))
16474     Flags |= MOLastUse;
16475   return Flags;
16476 }
16477 
16478 bool SITargetLowering::checkForPhysRegDependency(
16479     SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
16480     const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
16481   if (User->getOpcode() != ISD::CopyToReg)
16482     return false;
16483   if (!Def->isMachineOpcode())
16484     return false;
16485   MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
16486   if (!MDef)
16487     return false;
16488 
16489   unsigned ResNo = User->getOperand(Op).getResNo();
16490   if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
16491     return false;
16492   const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
16493   if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
16494     PhysReg = AMDGPU::SCC;
16495     const TargetRegisterClass *RC =
16496         TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16497     Cost = RC->getCopyCost();
16498     return true;
16499   }
16500   return false;
16501 }
16502 
16503 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16504   AtomicRMWInst::BinOp Op = AI->getOperation();
16505 
16506   if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16507       Op == AtomicRMWInst::Xor) {
16508     // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16509     assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16510            "this cannot be replaced with add");
16511     AI->setOperation(AtomicRMWInst::Add);
16512     return;
16513   }
16514 
16515   assert(Subtarget->hasAtomicFaddInsts() &&
16516          "target should have atomic fadd instructions");
16517   assert(AI->getType()->isFloatTy() &&
16518          AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
16519          "generic atomicrmw expansion only supports FP32 operand in flat "
16520          "address space");
16521   assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16522 
16523   // Given: atomicrmw fadd ptr %addr, float %val ordering
16524   //
16525   // With this expansion we produce the following code:
16526   //   [...]
16527   //   br label %atomicrmw.check.shared
16528   //
16529   // atomicrmw.check.shared:
16530   //   %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
16531   //   br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
16532   //
16533   // atomicrmw.shared:
16534   //   %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
16535   //   %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
16536   //                                   float %val ordering
16537   //   br label %atomicrmw.phi
16538   //
16539   // atomicrmw.check.private:
16540   //   %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
16541   //   br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
16542   //
16543   // atomicrmw.private:
16544   //   %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
16545   //   %loaded.private = load float, ptr addrspace(5) %cast.private
16546   //   %val.new = fadd float %loaded.private, %val
16547   //   store float %val.new, ptr addrspace(5) %cast.private
16548   //   br label %atomicrmw.phi
16549   //
16550   // atomicrmw.global:
16551   //   %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
16552   //   %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
16553   //                                   float %val ordering
16554   //   br label %atomicrmw.phi
16555   //
16556   // atomicrmw.phi:
16557   //   %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
16558   //                           [ %loaded.private, %atomicrmw.private ],
16559   //                           [ %loaded.global, %atomicrmw.global ]
16560   //   br label %atomicrmw.end
16561   //
16562   // atomicrmw.end:
16563   //    [...]
16564 
16565   IRBuilder<> Builder(AI);
16566   LLVMContext &Ctx = Builder.getContext();
16567 
16568   BasicBlock *BB = Builder.GetInsertBlock();
16569   Function *F = BB->getParent();
16570   BasicBlock *ExitBB =
16571       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16572   BasicBlock *CheckSharedBB =
16573       BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB);
16574   BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16575   BasicBlock *CheckPrivateBB =
16576       BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16577   BasicBlock *PrivateBB =
16578       BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
16579   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
16580   BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
16581 
16582   Value *Val = AI->getValOperand();
16583   Type *ValTy = Val->getType();
16584   Value *Addr = AI->getPointerOperand();
16585 
16586   auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
16587                                  Value *Val) -> Value * {
16588     AtomicRMWInst *OldVal =
16589         Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(),
16590                                 AI->getOrdering(), AI->getSyncScopeID());
16591     SmallVector<std::pair<unsigned, MDNode *>> MDs;
16592     AI->getAllMetadata(MDs);
16593     for (auto &P : MDs)
16594       OldVal->setMetadata(P.first, P.second);
16595     return OldVal;
16596   };
16597 
16598   std::prev(BB->end())->eraseFromParent();
16599   Builder.SetInsertPoint(BB);
16600   Builder.CreateBr(CheckSharedBB);
16601 
16602   Builder.SetInsertPoint(CheckSharedBB);
16603   CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16604                                                {Addr}, nullptr, "is.shared");
16605   Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16606 
16607   Builder.SetInsertPoint(SharedBB);
16608   Value *CastToLocal = Builder.CreateAddrSpaceCast(
16609       Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
16610   Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
16611   Builder.CreateBr(PhiBB);
16612 
16613   Builder.SetInsertPoint(CheckPrivateBB);
16614   CallInst *IsPrivate = Builder.CreateIntrinsic(
16615       Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
16616   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
16617 
16618   Builder.SetInsertPoint(PrivateBB);
16619   Value *CastToPrivate = Builder.CreateAddrSpaceCast(
16620       Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
16621   Value *LoadedPrivate =
16622       Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
16623   Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
16624   Builder.CreateStore(NewVal, CastToPrivate);
16625   Builder.CreateBr(PhiBB);
16626 
16627   Builder.SetInsertPoint(GlobalBB);
16628   Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16629       Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16630   Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
16631   Builder.CreateBr(PhiBB);
16632 
16633   Builder.SetInsertPoint(PhiBB);
16634   PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi");
16635   Loaded->addIncoming(LoadedShared, SharedBB);
16636   Loaded->addIncoming(LoadedPrivate, PrivateBB);
16637   Loaded->addIncoming(LoadedGlobal, GlobalBB);
16638   Builder.CreateBr(ExitBB);
16639 
16640   AI->replaceAllUsesWith(Loaded);
16641   AI->eraseFromParent();
16642 }
16643 
16644 LoadInst *
16645 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
16646   IRBuilder<> Builder(AI);
16647   auto Order = AI->getOrdering();
16648 
16649   // The optimization removes store aspect of the atomicrmw. Therefore, cache
16650   // must be flushed if the atomic ordering had a release semantics. This is
16651   // not necessary a fence, a release fence just coincides to do that flush.
16652   // Avoid replacing of an atomicrmw with a release semantics.
16653   if (isReleaseOrStronger(Order))
16654     return nullptr;
16655 
16656   LoadInst *LI = Builder.CreateAlignedLoad(
16657       AI->getType(), AI->getPointerOperand(), AI->getAlign());
16658   LI->setAtomic(Order, AI->getSyncScopeID());
16659   LI->copyMetadata(*AI);
16660   LI->takeName(AI);
16661   AI->replaceAllUsesWith(LI);
16662   AI->eraseFromParent();
16663   return LI;
16664 }
16665