//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ByteProvider.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Transforms/Utils/LowerAtomic.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "si-lower"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool>
    DisableLoopAlignment("amdgpu-disable-loop-alignment",
                         cl::desc("Do not align and prefetch loops"),
                         cl::init(false));

static cl::opt<bool> UseDivergentRegisterIndexing(
    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
    cl::init(false));

// TODO: This option should be removed once we switch to always using PTRADD in
// the SelectionDAG.
static cl::opt<bool> UseSelectionDAGPTRADD(
    "amdgpu-use-sdag-ptradd", cl::Hidden,
    cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
             "SelectionDAG ISel"),
    cl::init(false));

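// "Preserve sign" denormal mode flushes denormal inputs and results to a
// sign-preserving zero, i.e. denormals are effectively unsupported for the
// given type.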
static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
}

static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
}

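// Linearly scan for the first SGPR that the calling-convention state has not
// yet allocated; fails if every SGPR is already taken.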
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}

SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const GCNSubtarget &STI)
    : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);

  const SIRegisterInfo *TRI = STI.getRegisterInfo();
  const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();

  addRegisterClass(MVT::f64, V64RegClass);
  addRegisterClass(MVT::v2f32, V64RegClass);
  addRegisterClass(MVT::Untyped, V64RegClass);

  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
  addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));

  addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
  addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));

  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
  addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));

  addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
  addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));

  addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
  addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));

  addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
  addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));

  addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
  addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));

  addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
  addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));

  addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
  addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));

  addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
  addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));

  addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
  addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));

  addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
  addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));

  addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
  addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));

  addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
  addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));

  addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
  addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));

  if (Subtarget->has16BitInsts()) {
    if (Subtarget->useRealTrue16Insts()) {
      addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
      addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
      addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
    } else {
      addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
      addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
      addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
    }

    // Unless there are also VOP3P operations, no operations are really legal.
    addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
    addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
    addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
    addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
    addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
    addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
    addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
    addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
    addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
  }

  addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
  addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // The boolean content concept here is too inflexible. Compares only ever
  // really produce a 1-bit result. Any copy/extend from these will turn into a
  // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
  // it's what most targets use.
  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrOneBooleanContent);

  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     Custom);

  setOperationAction(ISD::STORE,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     Custom);

  if (isTypeLegal(MVT::bf16)) {
    for (unsigned Opc :
         {ISD::FADD,     ISD::FSUB,       ISD::FMUL,    ISD::FDIV,
          ISD::FREM,     ISD::FMA,        ISD::FMINNUM, ISD::FMAXNUM,
          ISD::FMINIMUM, ISD::FMAXIMUM,   ISD::FSQRT,   ISD::FCBRT,
          ISD::FSIN,     ISD::FCOS,       ISD::FPOW,    ISD::FPOWI,
          ISD::FLDEXP,   ISD::FFREXP,     ISD::FLOG,    ISD::FLOG2,
          ISD::FLOG10,   ISD::FEXP,       ISD::FEXP2,   ISD::FEXP10,
          ISD::FCEIL,    ISD::FTRUNC,     ISD::FRINT,   ISD::FNEARBYINT,
          ISD::FROUND,   ISD::FROUNDEVEN, ISD::FFLOOR,  ISD::FCANONICALIZE,
          ISD::SETCC}) {
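      // Promote performs each of these bf16 operations in f32 and converts
      // the result back to bf16; AddPromotedToType records f32 as the type
      // to promote to.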
      // FIXME: The promoted-to type shouldn't need to be explicit
      setOperationAction(Opc, MVT::bf16, Promote);
      AddPromotedToType(Opc, MVT::bf16, MVT::f32);
    }

    setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand);

    setOperationAction(ISD::SELECT, MVT::bf16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);

    setOperationAction(ISD::FABS, MVT::bf16, Legal);
    setOperationAction(ISD::FNEG, MVT::bf16, Legal);
    setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Legal);

    // We only need to custom lower because we can't specify an action for bf16
    // sources.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

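  // Truncating vector stores have no native support; Expand legalizes them,
  // typically as an explicit truncate of the value followed by a full-width
  // store (or by scalarizing).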
  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
  setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);

  setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);

  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::SELECT_CC,
                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);

  setOperationAction(ISD::TRUNCATE,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
                     Expand);
  setOperationAction(ISD::FP_ROUND,
                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
                     Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG,
                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},
                     Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC,
                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

  setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);

  setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal);

  setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64,
                     Expand);

#if 0
  setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal);
#endif

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  for (MVT VT :
       {MVT::v8i32,   MVT::v8f32,  MVT::v9i32,  MVT::v9f32,  MVT::v10i32,
        MVT::v10f32,  MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32,  MVT::v16f32, MVT::v2i64,  MVT::v2f64,  MVT::v4i16,
        MVT::v4f16,   MVT::v4bf16, MVT::v3i64,  MVT::v3f64,  MVT::v6i32,
        MVT::v6f32,   MVT::v4i64,  MVT::v4f64,  MVT::v8i64,  MVT::v8f64,
        MVT::v8i16,   MVT::v8f16,  MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16,  MVT::v32f16, MVT::v32bf16}) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch (Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::UNDEF:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::SCALAR_TO_VECTOR:
      case ISD::IS_FPCLASS:
        break;
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::INSERT_SUBVECTOR:
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);

  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
  // is expanded to avoid having two separate loops in case the index is a VGPR.

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
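  // In effect, each of these operations is performed on the equivalent
  // 32-bit integer vector type, with bitcasts on the operands and result.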
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
  }

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
  }

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
  }

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
  }

  setOperationAction(ISD::VECTOR_SHUFFLE,
                     {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
                      MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
                     Custom);

  if (Subtarget->hasPkMovB32()) {
    // TODO: 16-bit element vectors should be legal with even aligned elements.
    // TODO: Can be legal with wider source types than the result with
    // subregister extracts.
    setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
  }

  setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
                     Custom);

  // Avoid stack access for these.
  // TODO: Generalize to more vector types.
  setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                     Custom);

  // Deal with vec3 vector operations when widened to vec4.
  setOperationAction(ISD::INSERT_SUBVECTOR,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

  // Deal with vec5/6/7 vector operations when widened to vec8.
  setOperationAction(ISD::INSERT_SUBVECTOR,
                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
  setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);

  // We can't return success/failure, only the old value; let LLVM add the
  // comparison.
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
                     Expand);

  setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);

  setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);

  // FIXME: This should be narrowed to i32, but that only happens if i64 is
  // illegal.
  // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
  setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);

  // This is s_memtime on SI and s_memrealtime on VI.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (Subtarget->hasSMemRealTime() ||
      Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
    setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
  setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
    setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
  } else {
    setOperationAction(ISD::FSQRT, MVT::f16, Custom);
  }

  if (Subtarget->hasMadMacF32Insts())
    setOperationAction(ISD::FMAD, MVT::f32, Legal);

  if (!Subtarget->hasBFI())
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  if (Subtarget->hasFFBH())
    setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);

  if (Subtarget->hasFFBL())
    setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);

  // We only really have 32-bit BFE instructions (and 16-bit on VI).
  //
  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
  // effort to match them now. We want this to be false for i64 cases when the
  // extraction isn't restricted to the upper or lower half. Ideally we would
  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
  // span the midpoint are probably relatively rare, so don't worry about them
  // for now.
  if (Subtarget->hasBFE())
    setHasExtractBitsInsn(true);

  // Clamp modifier on add/sub
  if (Subtarget->hasIntClamp())
    setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal);

  if (Subtarget->hasAddNoCarry())
    setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
                       Legal);

  setOperationAction(
      {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
      {MVT::f32, MVT::f64}, Custom);

  // These are really only legal for ieee_mode functions. We should be avoiding
  // them for functions that don't have ieee_mode enabled, so just say they are
  // legal.
  setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
                     {MVT::f32, MVT::f64}, Legal);

  if (Subtarget->haveRoundOpsF64())
    setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
                       Legal);
  else
    setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
                       MVT::f64, Custom);

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
  setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
                     Legal);
  setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);

  setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

  setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
  setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);

  // Custom lower these because we can't specify a rule based on an illegal
  // source bf16.
  setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
  setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
                        ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
                       MVT::i16, Legal);

    AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);

    setOperationAction({ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
                       MVT::i16, Expand);

    setOperationAction({ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
                        ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
                        ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
                        ISD::CTPOP},
                       MVT::i16, Promote);

    setOperationAction(ISD::LOAD, MVT::i16, Custom);

    setTruncStoreAction(MVT::i64, MVT::i16, Expand);

    setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
    AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
    setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
    AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);

    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom);
    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i1, Custom);

    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i32, Custom);

    // F16 - Constant Actions.
    setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
    setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);

    // F16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);

    // BF16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);

    // F16 - VOP1 Actions.
    setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
                        ISD::FSIN, ISD::FROUND},
                       MVT::f16, Custom);

    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);

    // F16 - VOP2 Actions.
    setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
                       Expand);
    setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
    setOperationAction(ISD::FFREXP, MVT::f16, Custom);
    setOperationAction(ISD::FDIV, MVT::f16, Custom);

    // F16 - VOP3 Actions.
    setOperationAction(ISD::FMA, MVT::f16, Legal);
    if (STI.hasMadF16())
      setOperationAction(ISD::FMAD, MVT::f16, Legal);

    for (MVT VT :
         {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
          MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
          MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
        switch (Op) {
        case ISD::LOAD:
        case ISD::STORE:
        case ISD::BUILD_VECTOR:
        case ISD::BITCAST:
        case ISD::UNDEF:
        case ISD::EXTRACT_VECTOR_ELT:
        case ISD::INSERT_VECTOR_ELT:
        case ISD::INSERT_SUBVECTOR:
        case ISD::SCALAR_TO_VECTOR:
        case ISD::IS_FPCLASS:
          break;
        case ISD::EXTRACT_SUBVECTOR:
        case ISD::CONCAT_VECTORS:
          setOperationAction(Op, VT, Custom);
          break;
        default:
          setOperationAction(Op, VT, Expand);
          break;
        }
      }
    }

    // v_perm_b32 can handle either of these.
    setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
    setOperationAction(ISD::BSWAP, MVT::v4i16, Custom);

    // XXX - Do these do anything? Vector constants turn into build_vector.
    setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);

    setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
                       Legal);

    setOperationAction(ISD::STORE, MVT::v2i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::STORE, MVT::v2f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::AND, MVT::v2i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::OR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::XOR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);

    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);

    setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
    setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
    setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);

    setOperationAction(ISD::STORE, MVT::v8i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
    setOperationAction(ISD::STORE, MVT::v8f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
    setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);

    setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
    setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
    setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);

    setOperationAction(ISD::STORE, MVT::v16i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
    setOperationAction(ISD::STORE, MVT::v16f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
    setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);

    setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
    setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
    setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);

    setOperationAction(ISD::STORE, MVT::v32i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
    setOperationAction(ISD::STORE, MVT::v32f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
    setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);

    setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                       MVT::v2i32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);

    setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                       MVT::v4i32, Expand);

    setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                       MVT::v8i32, Expand);

    setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
                       Subtarget->hasVOP3PInsts() ? Legal : Custom);

    setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
    // This isn't really legal, but this avoids the legalizer unrolling it (and
    // allows matching fneg (fabs x) patterns)
    setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);

    // Can do this in one BFI plus a constant materialize.
    setOperationAction(ISD::FCOPYSIGN,
                       {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
                        MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
                        MVT::v32f16, MVT::v32bf16},
                       Custom);

    setOperationAction(
        {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
        MVT::f16, Custom);
    setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);

    setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
                        ISD::FMAXIMUMNUM},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Custom);

    setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Expand);

    for (MVT Vec16 :
         {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
          MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
      setOperationAction(
          {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
          Vec16, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);
    }
  }

  if (Subtarget->hasVOP3PInsts()) {
    setOperationAction({ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
                        ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
                        ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
                       MVT::v2i16, Legal);

    setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
                        ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
                       MVT::v2f16, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                       {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE,
                       {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
                        MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
                        MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
                       Custom);

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
      // Split vector operations.
      setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
                          ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN,
                          ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
                          ISD::SSUBSAT},
                         VT, Custom);

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
      // Split vector operations.
      setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
                         VT, Custom);

    setOperationAction(
        {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
        {MVT::v2f16, MVT::v4f16}, Custom);

    setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
    setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                       Custom);

    if (Subtarget->hasPackedFP32Ops()) {
      setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
                         MVT::v2f32, Legal);
      setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA},
                         {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
                         Custom);
    }
  }

  setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
  } else {
    // Legalization hack.
    setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);

    setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
  }

  setOperationAction(ISD::SELECT,
                     {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                      MVT::v32f16, MVT::v32bf16},
                     Custom);

  setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);

  if (Subtarget->hasScalarSMulU64())
    setOperationAction(ISD::MUL, MVT::i64, Custom);

  if (Subtarget->hasMad64_32())
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);

  if (Subtarget->hasPrefetch() && Subtarget->hasSafeSmemPrefetch())
    setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  if (Subtarget->hasIEEEMinimumMaximumInsts()) {
    setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
                       {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
  } else {
    // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
    if (Subtarget->hasMinimum3Maximum3F32())
      setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);

    if (Subtarget->hasMinimum3Maximum3PKF16()) {
      setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);

      // If only the vector form is available, we need to widen to a vector.
      if (!Subtarget->hasMinimum3Maximum3F16())
        setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
    }
  }

  if (Subtarget->hasVOP3PInsts()) {
    // We want to break these into v2f16 pieces, not scalarize.
    setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Custom);
  }

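  // Intrinsic nodes are custom-lowered in LowerINTRINSIC_WO_CHAIN,
  // LowerINTRINSIC_W_CHAIN and LowerINTRINSIC_VOID, which dispatch on the
  // intrinsic ID; the types listed cover the overloads that need it.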
  setOperationAction(ISD::INTRINSIC_WO_CHAIN,
                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
                      MVT::i8},
                     Custom);

  setOperationAction(ISD::INTRINSIC_W_CHAIN,
                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                     Custom);

  setOperationAction(ISD::INTRINSIC_VOID,
                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                     Custom);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
  setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
  setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
  setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);

  // TODO: Could move this to custom lowering, could benefit from combines on
  // extract of relevant bits.
  setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);

  setOperationAction(ISD::MUL, MVT::i1, Promote);

  if (Subtarget->hasBF16ConversionInsts()) {
    setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
  }

  if (Subtarget->hasCvtPkF16F32Inst()) {
    setOperationAction(ISD::FP_ROUND,
                       {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
                       Custom);
  }

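  // Opcodes registered here are only marked as interesting to combine; the
  // combines themselves are implemented in
  // SITargetLowering::PerformDAGCombine.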
  setTargetDAGCombine({ISD::ADD,
                       ISD::PTRADD,
                       ISD::UADDO_CARRY,
                       ISD::SUB,
                       ISD::USUBO_CARRY,
                       ISD::MUL,
                       ISD::FADD,
                       ISD::FSUB,
                       ISD::FDIV,
                       ISD::FMUL,
                       ISD::FMINNUM,
                       ISD::FMAXNUM,
                       ISD::FMINNUM_IEEE,
                       ISD::FMAXNUM_IEEE,
                       ISD::FMINIMUM,
                       ISD::FMAXIMUM,
                       ISD::FMINIMUMNUM,
                       ISD::FMAXIMUMNUM,
                       ISD::FMA,
                       ISD::SMIN,
                       ISD::SMAX,
                       ISD::UMIN,
                       ISD::UMAX,
                       ISD::SETCC,
                       ISD::SELECT,
                       ISD::AND,
                       ISD::OR,
                       ISD::XOR,
                       ISD::SHL,
                       ISD::SRL,
                       ISD::SRA,
                       ISD::FSHR,
                       ISD::SINT_TO_FP,
                       ISD::UINT_TO_FP,
                       ISD::FCANONICALIZE,
                       ISD::SCALAR_TO_VECTOR,
                       ISD::ZERO_EXTEND,
                       ISD::SIGN_EXTEND_INREG,
                       ISD::EXTRACT_VECTOR_ELT,
                       ISD::INSERT_VECTOR_ELT,
                       ISD::FCOPYSIGN});

  if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
    setTargetDAGCombine(ISD::FP_ROUND);

  // All memory operations. Some folding on the pointer operand is done to
  // help match the constant offsets in the addressing modes.
  setTargetDAGCombine({ISD::LOAD,
                       ISD::STORE,
                       ISD::ATOMIC_LOAD,
                       ISD::ATOMIC_STORE,
                       ISD::ATOMIC_CMP_SWAP,
                       ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
                       ISD::ATOMIC_SWAP,
                       ISD::ATOMIC_LOAD_ADD,
                       ISD::ATOMIC_LOAD_SUB,
                       ISD::ATOMIC_LOAD_AND,
                       ISD::ATOMIC_LOAD_OR,
                       ISD::ATOMIC_LOAD_XOR,
                       ISD::ATOMIC_LOAD_NAND,
                       ISD::ATOMIC_LOAD_MIN,
                       ISD::ATOMIC_LOAD_MAX,
                       ISD::ATOMIC_LOAD_UMIN,
                       ISD::ATOMIC_LOAD_UMAX,
                       ISD::ATOMIC_LOAD_FADD,
                       ISD::ATOMIC_LOAD_FMIN,
                       ISD::ATOMIC_LOAD_FMAX,
                       ISD::ATOMIC_LOAD_UINC_WRAP,
                       ISD::ATOMIC_LOAD_UDEC_WRAP,
                       ISD::INTRINSIC_VOID,
                       ISD::INTRINSIC_W_CHAIN});

  // FIXME: In other contexts we pretend this is a per-function property.
  setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);

  setSchedulingPreference(Sched::RegPressure);
}

const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }

ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
  static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
  return RCRegs;
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

// v_mad_mix* support a conversion from f16 to f32.
//
// There is only one special case where this is OK to use when denormals are
// enabled, which we don't currently handle.
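//
// e.g. (f32 (fma (fp_extend f16:$a), (fp_extend f16:$b), f32:$c)) can select
// to v_fma_mix_f32, folding the extensions into the operand modifiers.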
bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
                                       EVT DestVT, EVT SrcVT) const {
  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
         DestVT.getScalarType() == MVT::f32 &&
         SrcVT.getScalarType() == MVT::f16 &&
         // TODO: This probably only requires no input flushing?
         denormalModeIsFlushAllF32(DAG.getMachineFunction());
}

bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
                                       LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         DestTy.getScalarSizeInBits() == 32 &&
         SrcTy.getScalarSizeInBits() == 16 &&
         // TODO: This probably only requires no input flushing?
         denormalModeIsFlushAllF32(*MI.getMF());
}

bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}

MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                    CallingConv::ID CC,
                                                    EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 16) {
      if (Subtarget->has16BitInsts()) {
        if (VT.isInteger())
          return MVT::v2i16;
        return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
      }
      return VT.isInteger() ? MVT::i32 : MVT::f32;
    }

    if (Size < 16)
      return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
    return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
  }

  if (VT.getSizeInBits() > 32)
    return MVT::i32;

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}

unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();

    // FIXME: Should probably promote 8-bit vectors to i16.
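    // e.g. a v3f16 argument is passed in (3 + 1) / 2 == 2 v2f16 registers.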
    if (Size == 16 && Subtarget->has16BitInsts())
      return (NumElts + 1) / 2;

    if (Size <= 32)
      return NumElts;

    if (Size > 32)
      return NumElts * ((Size + 31) / 32);
  } else if (VT.getSizeInBits() > 32)
    return (VT.getSizeInBits() + 31) / 32;

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}

unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    // FIXME: We should fix the ABI to be the same on targets without 16-bit
    // support, but unless we can properly handle 3-vectors, it will still be
    // inconsistent.
    if (Size == 16 && Subtarget->has16BitInsts()) {
      if (ScalarVT == MVT::bf16) {
        RegisterVT = MVT::i32;
        IntermediateVT = MVT::v2bf16;
      } else {
        RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
        IntermediateVT = RegisterVT;
      }
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;
    }

    if (Size == 32) {
      RegisterVT = ScalarVT.getSimpleVT();
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size < 16 && Subtarget->has16BitInsts()) {
      // FIXME: Should probably form v2i16 pieces
      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size != 16 && Size <= 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size > 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;
    }
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}

static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
                                 const DataLayout &DL, Type *Ty,
                                 unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);

  LLVMContext &Ctx = Ty->getContext();
  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
    return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
                            NumElts);
  }

  return TLI.getValueType(DL, Ty);
}

// Peek through TFE struct returns to only use the data size.
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
                                   const DataLayout &DL, Type *Ty,
                                   unsigned MaxNumLanes) {
  auto *ST = dyn_cast<StructType>(Ty);
  if (!ST)
    return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);

  // TFE intrinsics return an aggregate type.
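  // e.g. a TFE image load returns { <4 x float>, i32 }; only the data member
  // determines the memory VT.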
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));
  return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
}

/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
/// in-memory representation. This return value is a custom type because there
/// is no MVT::i160 and adding one breaks integer promotion logic. While this
/// could cause issues during codegen, these address space 7 pointers will be
/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
/// in order to allow pre-codegen passes that query TargetTransformInfo, often
/// for cost modeling, to work. (This also sets us up decently for doing the
/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
  if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
    return MVT::amdgpuBufferFatPointer;
  if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
      DL.getPointerSizeInBits(AS) == 192)
    return MVT::amdgpuBufferStridedPointer;
  return AMDGPUTargetLowering::getPointerTy(DL, AS);
}
/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
/// v8i32 when padding is added.
/// The in-memory representation of a p9 is {p8, i32, i32}, which is
/// also v8i32 with padding.
MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
  if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
       DL.getPointerSizeInBits(AS) == 160) ||
      (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
       DL.getPointerSizeInBits(AS) == 192))
    return MVT::v8i32;
  return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
}

bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          MachineFunction &MF,
                                          unsigned IntrID) const {
  Info.flags = MachineMemOperand::MONone;
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))
    Info.flags |= MachineMemOperand::MOInvariant;
  if (CI.hasMetadata(LLVMContext::MD_nontemporal))
    Info.flags |= MachineMemOperand::MONonTemporal;
  Info.flags |= getTargetMMOFlags(CI);

  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
          AMDGPU::lookupRsrcIntrinsic(IntrID)) {
    AttributeSet Attr =
        Intrinsic::getFnAttributes(CI.getContext(), (Intrinsic::ID)IntrID);
    MemoryEffects ME = Attr.getMemoryEffects();
    if (ME.doesNotAccessMemory())
      return false;

    // TODO: Should images get their own address space?
    Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;

    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
    if (RsrcIntr->IsImage) {
      const AMDGPU::ImageDimIntrinsicInfo *Intr =
          AMDGPU::getImageDimIntrinsicInfo(IntrID);
      BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
      Info.align.reset();
    }

    Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
      if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
        // We conservatively set the memory operand of a buffer intrinsic to the
        // base resource pointer, so that we can access alias information about
        // those pointers. Cases like "this points at the same value
        // but with a different offset" are handled in
        // areMemAccessesTriviallyDisjoint.
        Info.ptrVal = RsrcArg;
    }

    bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
    if (!IsSPrefetch) {
      auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
      if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
        Info.flags |= MachineMemOperand::MOVolatile;
    }

    Info.flags |= MachineMemOperand::MODereferenceable;
    if (ME.onlyReadsMemory()) {
      if (RsrcIntr->IsImage) {
        unsigned MaxNumLanes = 4;

        if (!BaseOpcode->Gather4) {
          // If this isn't a gather, we may have excess loaded elements in the
          // IR type. Check the dmask for the real number of elements loaded.
1295           unsigned DMask =
1296               cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1297           MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1298         }
1299 
1300         Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1301                                              CI.getType(), MaxNumLanes);
1302       } else {
1303         Info.memVT =
1304             memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
1305                                     std::numeric_limits<unsigned>::max());
1306       }
1307 
1308       // FIXME: What does alignment mean for an image?
1309       Info.opc = ISD::INTRINSIC_W_CHAIN;
1310       Info.flags |= MachineMemOperand::MOLoad;
1311     } else if (ME.onlyWritesMemory()) {
1312       Info.opc = ISD::INTRINSIC_VOID;
1313 
1314       Type *DataTy = CI.getArgOperand(0)->getType();
1315       if (RsrcIntr->IsImage) {
1316         unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1317         unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1318         Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1319                                            DMaskLanes);
1320       } else
1321         Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1322 
1323       Info.flags |= MachineMemOperand::MOStore;
1324     } else {
1325       // Atomic, no-return sampler, or prefetch.
1326       Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1327                                           : ISD::INTRINSIC_W_CHAIN;
1328       Info.flags |=
1329           MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
1330 
1331       if (!IsSPrefetch)
1332         Info.flags |= MachineMemOperand::MOStore;
1333 
1334       switch (IntrID) {
1335       default:
1336         if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1337           // Fake memory access type for no return sampler intrinsics
1338           Info.memVT = MVT::i32;
1339         } else {
1340           // XXX - Should this be volatile without known ordering?
1341           Info.flags |= MachineMemOperand::MOVolatile;
1342           Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1343         }
1344         break;
1345       case Intrinsic::amdgcn_raw_buffer_load_lds:
1346       case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1347       case Intrinsic::amdgcn_struct_buffer_load_lds:
1348       case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1349         unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1350         Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1351         Info.ptrVal = CI.getArgOperand(1);
1352         return true;
1353       }
1354       case Intrinsic::amdgcn_raw_atomic_buffer_load:
1355       case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1356       case Intrinsic::amdgcn_struct_atomic_buffer_load:
1357       case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1358         Info.memVT =
1359             memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
1360                                     std::numeric_limits<unsigned>::max());
1361         Info.flags &= ~MachineMemOperand::MOStore;
1362         return true;
1363       }
1364       }
1365     }
1366     return true;
1367   }
1368 
1369   switch (IntrID) {
1370   case Intrinsic::amdgcn_ds_ordered_add:
1371   case Intrinsic::amdgcn_ds_ordered_swap: {
1372     Info.opc = ISD::INTRINSIC_W_CHAIN;
1373     Info.memVT = MVT::getVT(CI.getType());
1374     Info.ptrVal = CI.getOperand(0);
1375     Info.align.reset();
1376     Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1377 
1378     const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1379     if (!Vol->isZero())
1380       Info.flags |= MachineMemOperand::MOVolatile;
1381 
1382     return true;
1383   }
1384   case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1385   case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1386     Info.opc = ISD::INTRINSIC_W_CHAIN;
1387     Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1388     Info.ptrVal = nullptr;
1389     Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1390     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1391     return true;
1392   }
1393   case Intrinsic::amdgcn_ds_append:
1394   case Intrinsic::amdgcn_ds_consume: {
1395     Info.opc = ISD::INTRINSIC_W_CHAIN;
1396     Info.memVT = MVT::getVT(CI.getType());
1397     Info.ptrVal = CI.getOperand(0);
1398     Info.align.reset();
1399     Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1400 
1401     const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1402     if (!Vol->isZero())
1403       Info.flags |= MachineMemOperand::MOVolatile;
1404 
1405     return true;
1406   }
1407   case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1408   case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1409     Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1410                    ? ISD::INTRINSIC_W_CHAIN
1411                    : ISD::INTRINSIC_VOID;
1412     Info.ptrVal = CI.getOperand(0);
1413     // The memory access is always 64 bits wide, regardless of the IR type.
1414     Info.memVT = MVT::i64;
1415     Info.size = 8;
1416     Info.align.reset();
1417     Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1418     return true;
1419   }
1420   case Intrinsic::amdgcn_global_atomic_csub: {
1421     Info.opc = ISD::INTRINSIC_W_CHAIN;
1422     Info.memVT = MVT::getVT(CI.getType());
1423     Info.ptrVal = CI.getOperand(0);
1424     Info.align.reset();
1425     Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
1426                   MachineMemOperand::MOVolatile;
1427     return true;
1428   }
1429   case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1430   case Intrinsic::amdgcn_image_bvh_intersect_ray:
1431   case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1432     Info.opc = ISD::INTRINSIC_W_CHAIN;
1433     Info.memVT =
1434         MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1435                        ? CI.getType()
1436                        : cast<StructType>(CI.getType())
1437                              ->getElementType(0)); // XXX: what is correct VT?
1438 
1439     Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1440     Info.align.reset();
1441     Info.flags |=
1442         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
1443     return true;
1444   }
1445   case Intrinsic::amdgcn_global_atomic_fmin_num:
1446   case Intrinsic::amdgcn_global_atomic_fmax_num:
1447   case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1448   case Intrinsic::amdgcn_flat_atomic_fmin_num:
1449   case Intrinsic::amdgcn_flat_atomic_fmax_num:
1450   case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1451     Info.opc = ISD::INTRINSIC_W_CHAIN;
1452     Info.memVT = MVT::getVT(CI.getType());
1453     Info.ptrVal = CI.getOperand(0);
1454     Info.align.reset();
1455     Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
1456                   MachineMemOperand::MODereferenceable |
1457                   MachineMemOperand::MOVolatile;
1458     return true;
1459   }
1460   case Intrinsic::amdgcn_ds_load_tr6_b96:
1461   case Intrinsic::amdgcn_ds_load_tr4_b64:
1462   case Intrinsic::amdgcn_ds_load_tr8_b64:
1463   case Intrinsic::amdgcn_ds_load_tr16_b128:
1464   case Intrinsic::amdgcn_global_load_tr6_b96:
1465   case Intrinsic::amdgcn_global_load_tr4_b64:
1466   case Intrinsic::amdgcn_global_load_tr_b64:
1467   case Intrinsic::amdgcn_global_load_tr_b128:
1468   case Intrinsic::amdgcn_ds_read_tr4_b64:
1469   case Intrinsic::amdgcn_ds_read_tr6_b96:
1470   case Intrinsic::amdgcn_ds_read_tr8_b64:
1471   case Intrinsic::amdgcn_ds_read_tr16_b64: {
1472     Info.opc = ISD::INTRINSIC_W_CHAIN;
1473     Info.memVT = MVT::getVT(CI.getType());
1474     Info.ptrVal = CI.getOperand(0);
1475     Info.align.reset();
1476     Info.flags |= MachineMemOperand::MOLoad;
1477     return true;
1478   }
1479   case Intrinsic::amdgcn_ds_gws_init:
1480   case Intrinsic::amdgcn_ds_gws_barrier:
1481   case Intrinsic::amdgcn_ds_gws_sema_v:
1482   case Intrinsic::amdgcn_ds_gws_sema_br:
1483   case Intrinsic::amdgcn_ds_gws_sema_p:
1484   case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1485     Info.opc = ISD::INTRINSIC_VOID;
1486 
1487     const GCNTargetMachine &TM =
1488         static_cast<const GCNTargetMachine &>(getTargetMachine());
1489 
1490     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1491     Info.ptrVal = MFI->getGWSPSV(TM);
1492 
1493     // This is an abstract access, but we need to specify a type and size.
1494     Info.memVT = MVT::i32;
1495     Info.size = 4;
1496     Info.align = Align(4);
1497 
1498     if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1499       Info.flags |= MachineMemOperand::MOLoad;
1500     else
1501       Info.flags |= MachineMemOperand::MOStore;
1502     return true;
1503   }
1504   case Intrinsic::amdgcn_load_to_lds:
1505   case Intrinsic::amdgcn_global_load_lds: {
1506     Info.opc = ISD::INTRINSIC_VOID;
1507     unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1508     Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1509     Info.ptrVal = CI.getArgOperand(1);
1510     Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1511     return true;
1512   }
1513   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1514   case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1515   case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1516   case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1517     Info.opc = ISD::INTRINSIC_W_CHAIN;
1518 
1519     const GCNTargetMachine &TM =
1520         static_cast<const GCNTargetMachine &>(getTargetMachine());
1521 
1522     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1523     Info.ptrVal = MFI->getGWSPSV(TM);
1524 
1525     // This is an abstract access, but we need to specify a type and size.
1526     Info.memVT = MVT::i32;
1527     Info.size = 4;
1528     Info.align = Align(4);
1529 
1530     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1531     return true;
1532   }
1533   case Intrinsic::amdgcn_s_prefetch_data: {
1534     Info.opc = ISD::INTRINSIC_VOID;
1535     Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1536     Info.ptrVal = CI.getArgOperand(0);
1537     Info.flags |= MachineMemOperand::MOLoad;
1538     return true;
1539   }
1540   default:
1541     return false;
1542   }
1543 }
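
// Illustrative example of the switch above (the operand list is a sketch; see
// IntrinsicsAMDGPU.td for the authoritative signature): for a call such as
//   %v = call i32 @llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %m0, i32 %val,
//                                             i32 0, i32 0, i1 false, i32 0,
//                                             i1 false, i1 false)
// Info is filled with opc = ISD::INTRINSIC_W_CHAIN, memVT = MVT::i32,
// ptrVal = %m0, and MOLoad | MOStore flags; the i1 false at operand index 4
// means no MOVolatile flag is added.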
1544 
1545 void SITargetLowering::CollectTargetIntrinsicOperands(
1546     const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1547   switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1548   case Intrinsic::amdgcn_addrspacecast_nonnull: {
1549     // The DAG's ValueType loses the addrspaces.
1550     // Add them as 2 extra Constant operands "from" and "to".
1551     unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1552     unsigned DstAS = I.getType()->getPointerAddressSpace();
1553     Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1554     Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1555     break;
1556   }
1557   default:
1558     break;
1559   }
1560 }
1561 
1562 bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
1563                                             SmallVectorImpl<Value *> &Ops,
1564                                             Type *&AccessTy) const {
1565   Value *Ptr = nullptr;
1566   switch (II->getIntrinsicID()) {
1567   case Intrinsic::amdgcn_atomic_cond_sub_u32:
1568   case Intrinsic::amdgcn_ds_append:
1569   case Intrinsic::amdgcn_ds_consume:
1570   case Intrinsic::amdgcn_ds_load_tr8_b64:
1571   case Intrinsic::amdgcn_ds_load_tr16_b128:
1572   case Intrinsic::amdgcn_ds_load_tr4_b64:
1573   case Intrinsic::amdgcn_ds_load_tr6_b96:
1574   case Intrinsic::amdgcn_ds_read_tr4_b64:
1575   case Intrinsic::amdgcn_ds_read_tr6_b96:
1576   case Intrinsic::amdgcn_ds_read_tr8_b64:
1577   case Intrinsic::amdgcn_ds_read_tr16_b64:
1578   case Intrinsic::amdgcn_ds_ordered_add:
1579   case Intrinsic::amdgcn_ds_ordered_swap:
1580   case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1581   case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1582   case Intrinsic::amdgcn_flat_atomic_fmax_num:
1583   case Intrinsic::amdgcn_flat_atomic_fmin_num:
1584   case Intrinsic::amdgcn_global_atomic_csub:
1585   case Intrinsic::amdgcn_global_atomic_fmax_num:
1586   case Intrinsic::amdgcn_global_atomic_fmin_num:
1587   case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1588   case Intrinsic::amdgcn_global_load_tr_b64:
1589   case Intrinsic::amdgcn_global_load_tr_b128:
1590   case Intrinsic::amdgcn_global_load_tr4_b64:
1591   case Intrinsic::amdgcn_global_load_tr6_b96:
1592     Ptr = II->getArgOperand(0);
1593     break;
1594   case Intrinsic::amdgcn_load_to_lds:
1595   case Intrinsic::amdgcn_global_load_lds:
1596     Ptr = II->getArgOperand(1);
1597     break;
1598   default:
1599     return false;
1600   }
1601   AccessTy = II->getType();
1602   Ops.push_back(Ptr);
1603   return true;
1604 }
1605 
1606 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1607                                                  unsigned AddrSpace) const {
1608   if (!Subtarget->hasFlatInstOffsets()) {
1609     // Flat instructions do not have offsets, and only have the register
1610     // address.
1611     return AM.BaseOffs == 0 && AM.Scale == 0;
1612   }
1613 
1614   decltype(SIInstrFlags::FLAT) FlatVariant =
1615       AddrSpace == AMDGPUAS::GLOBAL_ADDRESS    ? SIInstrFlags::FlatGlobal
1616       : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch
1617                                                : SIInstrFlags::FLAT;
1618 
1619   return AM.Scale == 0 &&
1620          (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1621                                   AM.BaseOffs, AddrSpace, FlatVariant));
1622 }
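
// Hedged example: on a subtarget with flat instruction offsets, a mode such
// as {BaseReg, BaseOffs = 2040, Scale = 0} is accepted for GLOBAL_ADDRESS iff
// isLegalFLATOffset(2040, GLOBAL_ADDRESS, FlatGlobal) holds for that
// generation, while any mode with Scale != 0 is always rejected because FLAT
// addressing has no register scaling.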
1623 
1624 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1625   if (Subtarget->hasFlatGlobalInsts())
1626     return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS);
1627 
1628   if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1629     // Assume that we will use FLAT for all global memory accesses
1630     // on VI.
1631     // FIXME: This assumption is currently wrong.  On VI we still use
1632     // MUBUF instructions for the r + i addressing mode.  As currently
1633     // implemented, the MUBUF instructions only work on buffer < 4GB.
1634     // It may be possible to support > 4GB buffers with MUBUF instructions,
1635     // by setting the stride value in the resource descriptor which would
1636     // increase the size limit to (stride * 4GB).  However, this is risky,
1637     // because it has never been validated.
1638     return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
1639   }
1640 
1641   return isLegalMUBUFAddressingMode(AM);
1642 }
1643 
1644 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1645   // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1646   // additionally can do r + r + i with addr64. 32-bit has more addressing
1647   // mode options. Depending on the resource constant, it can also do
1648   // (i64 r0) + (i32 r1) * (i14 i).
1649   //
1650   // Private arrays end up using a scratch buffer most of the time, so also
1651   // assume those use MUBUF instructions. Scratch loads / stores are currently
1652   // implemented as MUBUF instructions with the offen bit set, so they are
1653   // slightly different from the normal addr64 mode.
1654   const SIInstrInfo *TII = Subtarget->getInstrInfo();
1655   if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1656     return false;
1657 
1658   // FIXME: Since we can split immediate into soffset and immediate offset,
1659   // would it make sense to allow any immediate?
1660 
1661   switch (AM.Scale) {
1662   case 0: // r + i or just i, depending on HasBaseReg.
1663     return true;
1664   case 1:
1665     return true; // We have r + r or r + i.
1666   case 2:
1667     if (AM.HasBaseReg) {
1668       // Reject 2 * r + r.
1669       return false;
1670     }
1671 
1672     // Allow 2 * r as r + r
1673     // Or  2 * r + i is allowed as r + r + i.
1674     return true;
1675   default: // Don't allow n * r
1676     return false;
1677   }
1678 }
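
// Worked examples for the Scale switch above (illustrative; each assumes the
// immediate already passed isLegalMUBUFImmOffset):
//   {BaseOffs = 16, Scale = 0}                     -> legal (r + 16, or just 16)
//   {BaseOffs = 0,  Scale = 1, HasBaseReg = true}  -> legal (r + r)
//   {BaseOffs = 8,  Scale = 2, HasBaseReg = false} -> legal (r + r + 8)
//   {BaseOffs = 0,  Scale = 2, HasBaseReg = true}  -> rejected (2 * r + r)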
1679 
1680 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1681                                              const AddrMode &AM, Type *Ty,
1682                                              unsigned AS,
1683                                              Instruction *I) const {
1684   // No global is ever allowed as a base.
1685   if (AM.BaseGV)
1686     return false;
1687 
1688   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1689     return isLegalGlobalAddressingMode(AM);
1690 
1691   if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1692       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1693       AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
1694       AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
1695     // If the offset isn't a multiple of 4, it probably isn't going to be
1696     // correctly aligned.
1697     // FIXME: Can we get the real alignment here?
1698     if (AM.BaseOffs % 4 != 0)
1699       return isLegalMUBUFAddressingMode(AM);
1700 
1701     if (!Subtarget->hasScalarSubwordLoads()) {
1702       // There are no SMRD extloads, so if we have to do a small type access we
1703       // will use a MUBUF load.
1704       // FIXME?: We also need to do this if unaligned, but we don't know the
1705       // alignment here.
1706       if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1707         return isLegalGlobalAddressingMode(AM);
1708     }
1709 
1710     if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1711       // SMRD instructions have an 8-bit, dword offset on SI.
1712       if (!isUInt<8>(AM.BaseOffs / 4))
1713         return false;
1714     } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1715       // On CI+, this can also be a 32-bit literal constant offset. If it fits
1716       // in 8-bits, it can use a smaller encoding.
1717       if (!isUInt<32>(AM.BaseOffs / 4))
1718         return false;
1719     } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1720       // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1721       if (!isUInt<20>(AM.BaseOffs))
1722         return false;
1723     } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1724       // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1725       // for S_BUFFER_* instructions).
1726       if (!isInt<21>(AM.BaseOffs))
1727         return false;
1728     } else {
1729       // On GFX12, all offsets are signed 24-bit in bytes.
1730       if (!isInt<24>(AM.BaseOffs))
1731         return false;
1732     }
1733 
1734     if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1735          AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1736         AM.BaseOffs < 0) {
1737       // Scalar (non-buffer) loads can only use a negative offset if
1738       // soffset+offset is non-negative. Since the compiler can only prove that
1739       // in a few special cases, it is safer to claim that negative offsets are
1740       // not supported.
1741       return false;
1742     }
1743 
1744     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1745       return true;
1746 
1747     if (AM.Scale == 1 && AM.HasBaseReg)
1748       return true;
1749 
1750     return false;
1751   }
1752 
1753   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1754     return Subtarget->enableFlatScratch()
1755                ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS)
1756                : isLegalMUBUFAddressingMode(AM);
1757 
1758   if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1759       (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1760     // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1761     // field.
1762     // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1763     // an 8-bit dword offset but we don't know the alignment here.
1764     if (!isUInt<16>(AM.BaseOffs))
1765       return false;
1766 
1767     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1768       return true;
1769 
1770     if (AM.Scale == 1 && AM.HasBaseReg)
1771       return true;
1772 
1773     return false;
1774   }
1775 
1776   if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1777     // For an unknown address space, this usually means that this is for some
1778     // reason being used for pure arithmetic, and not based on some addressing
1779     // computation. We don't have instructions that compute pointers with any
1780     // addressing modes, so treat them as having no offset like flat
1781     // instructions.
1782     return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
1783   }
1784 
1785   // Assume a user alias of global for unknown address spaces.
1786   return isLegalGlobalAddressingMode(AM);
1787 }
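
// Worked SMRD-offset example (illustrative): a constant-address access with
// BaseOffs = 1020 encodes as a dword offset of 255 and still fits the 8-bit
// SI encoding (isUInt<8>(255)). BaseOffs = 1024 (dword offset 256) is
// rejected on SI but accepted on CI+ (32-bit literal), VI (20-bit byte
// offset), and GFX9+ (signed 21- or 24-bit byte offsets).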
1788 
1789 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1790                                         const MachineFunction &MF) const {
1791   if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1792     return (MemVT.getSizeInBits() <= 4 * 32);
1793   if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1794     unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1795     return (MemVT.getSizeInBits() <= MaxPrivateBits);
1796   }
1797   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
1798     return (MemVT.getSizeInBits() <= 2 * 32);
1799   return true;
1800 }
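
// Illustrative consequence: in the global and flat address spaces, stores may
// be merged up to 128 bits (e.g. four i32 stores into one v4i32 store), LDS
// and region stores are capped at 64 bits, and private stores are bounded by
// the subtarget's maximum private element size.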
1801 
1802 bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1803     unsigned Size, unsigned AddrSpace, Align Alignment,
1804     MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1805   if (IsFast)
1806     *IsFast = 0;
1807 
1808   if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1809       AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1810     // Check if alignment requirements for ds_read/write instructions are
1811     // disabled.
1812     if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1813       return false;
1814 
1815     Align RequiredAlignment(
1816         PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1817     if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1818         Alignment < RequiredAlignment)
1819       return false;
1820 
1821     // Either the alignment requirements are "enabled", or there is an
1822     // unaligned-LDS-access-related hardware bug even though the alignment
1823     // requirements are "disabled". In either case, we need to check for proper
1824     // alignment requirements.
1825     //
1826     switch (Size) {
1827     case 64:
1828       // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1829       // address is negative, then the instruction is incorrectly treated as
1830       // out-of-bounds even if base + offsets is in bounds. Split vectorized
1831       // loads here to avoid emitting ds_read2_b32. We may re-combine the
1832       // load later in the SILoadStoreOptimizer.
1833       if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1834         return false;
1835 
1836       // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1837       // can do a 4-byte aligned, 8-byte access in a single operation using
1838       // ds_read2/write2_b32 with adjacent offsets.
1839       RequiredAlignment = Align(4);
1840 
1841       if (Subtarget->hasUnalignedDSAccessEnabled()) {
1842         // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1843         // ds_write2_b32 depending on the alignment. In either case with either
1844         // alignment there is no faster way of doing this.
1845 
1846         // The numbers returned here and below are not additive; they form a
1847         // 'speed rank'. They are only meant to be compared to decide whether one
1848         // way of lowering an operation is faster than another. For that purpose
1849         // a naturally aligned operation gets its bitsize to indicate that "it
1850         // operates at a speed comparable to an N-bit wide load". With full
1851         // alignment ds128 is slower than ds96, for example. If underaligned, it
1852         // is comparable to the speed of a single dword access, which would then
1853         // mean 32 < 128 and it is faster to issue a wide load regardless.
1854         // 1 simply means "slow, don't do it", i.e. comparing an aligned load to
1855         // a wider load that is no longer aligned, the latter is slower.
1856         if (IsFast)
1857           *IsFast = (Alignment >= RequiredAlignment) ? 64
1858                     : (Alignment < Align(4))         ? 32
1859                                                      : 1;
1860         return true;
1861       }
1862 
1863       break;
1864     case 96:
1865       if (!Subtarget->hasDS96AndDS128())
1866         return false;
1867 
1868       // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1869       // gfx8 and older.
1870 
1871       if (Subtarget->hasUnalignedDSAccessEnabled()) {
1872         // Naturally aligned access is fastest. However, also report it as Fast
1873         // if memory is aligned to less than a dword. A narrow load or store
1874         // will be equally slow as a single ds_read_b96/ds_write_b96, but there
1875         // will be more of them, so overall we will pay less penalty issuing a
1876         // single instruction.
1877 
1878         // See comment on the values above.
1879         if (IsFast)
1880           *IsFast = (Alignment >= RequiredAlignment) ? 96
1881                     : (Alignment < Align(4))         ? 32
1882                                                      : 1;
1883         return true;
1884       }
1885 
1886       break;
1887     case 128:
1888       if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1889         return false;
1890 
1891       // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1892       // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1893       // single operation using ds_read2/write2_b64.
1894       RequiredAlignment = Align(8);
1895 
1896       if (Subtarget->hasUnalignedDSAccessEnabled()) {
1897         // Naturally aligned access is fastest. However, also report it as Fast
1898         // if memory is aligned to less than a dword. A narrow load or store
1899         // will be equally slow as a single ds_read_b128/ds_write_b128, but
1900         // there will be more of them, so overall we will pay less penalty
1901         // issuing a single instruction.
1902 
1903         // See comment on the values above.
1904         if (IsFast)
1905           *IsFast = (Alignment >= RequiredAlignment) ? 128
1906                     : (Alignment < Align(4))         ? 32
1907                                                      : 1;
1908         return true;
1909       }
1910 
1911       break;
1912     default:
1913       if (Size > 32)
1914         return false;
1915 
1916       break;
1917     }
1918 
1919     // See comment on the values above.
1920     // Note that we have a single-dword or sub-dword here, so if underaligned
1921     // it is the slowest possible access, hence the returned value is 0.
1922     if (IsFast)
1923       *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1924 
1925     return Alignment >= RequiredAlignment ||
1926            Subtarget->hasUnalignedDSAccessEnabled();
1927   }
1928 
1929   // FIXME: We have to be conservative here and assume that flat operations
1930   // will access scratch.  If we had access to the IR function, then we
1931   // could determine if any private memory was used in the function.
1932   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1933       AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
1934     bool AlignedBy4 = Alignment >= Align(4);
1935     if (IsFast)
1936       *IsFast = AlignedBy4;
1937 
1938     return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
1939   }
1940 
1941   // So long as they are correct, wide global memory operations perform better
1942   // than multiple smaller memory ops -- even when misaligned.
1943   if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1944     if (IsFast)
1945       *IsFast = Size;
1946 
1947     return Alignment >= Align(4) ||
1948            Subtarget->hasUnalignedBufferAccessEnabled();
1949   }
1950 
1951   // Ensure robust out-of-bounds guarantees for buffer accesses are met if
1952   // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
1953   // out-of-bounds behavior, but in the edge case where an access starts
1954   // out-of-bounds and then enters in-bounds, the entire access would be treated
1955   // as out-of-bounds. Prevent misaligned memory accesses by requiring the
1956   // natural alignment of buffer accesses.
1957   if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
1958       AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
1959       AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
1960     if (!Subtarget->hasRelaxedBufferOOBMode() &&
1961         Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
1962       return false;
1963   }
1964 
1965   // Smaller than dword value must be aligned.
1966   if (Size < 32)
1967     return false;
1968 
1969   // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1970   // byte-address are ignored, thus forcing Dword alignment.
1971   // This applies to private, global, and constant memory.
1972   if (IsFast)
1973     *IsFast = 1;
1974 
1975   return Size >= 32 && Alignment >= Align(4);
1976 }
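
// Hedged walk-through of the LDS path above: with unaligned DS access
// enabled, a 64-bit access with Align(4) reports *IsFast = 64 because the
// required alignment was relaxed to 4 (ds_read2/write2_b32), while a 96-bit
// access with Align(8) reports *IsFast = 1 ("slow"): it is neither naturally
// (16-byte) aligned nor below dword alignment.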
1977 
1978 bool SITargetLowering::allowsMisalignedMemoryAccesses(
1979     EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1980     unsigned *IsFast) const {
1981   return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
1982                                             Alignment, Flags, IsFast);
1983 }
1984 
1985 EVT SITargetLowering::getOptimalMemOpType(
1986     LLVMContext &Context, const MemOp &Op,
1987     const AttributeList &FuncAttributes) const {
1988   // FIXME: Should account for address space here.
1989 
1990   // The default fallback uses the private pointer size as a guess for a type to
1991   // use. Make sure we switch these to 64-bit accesses.
1992 
1993   if (Op.size() >= 16 &&
1994       Op.isDstAligned(Align(4))) // XXX: Should only do for global
1995     return MVT::v4i32;
1996 
1997   if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1998     return MVT::v2i32;
1999 
2000   // Use the default.
2001   return MVT::Other;
2002 }
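
// Illustrative sizing under the rules above: a copy of 16 bytes or more whose
// destination is at least 4-byte aligned is hinted as MVT::v4i32, an 8..15
// byte copy as MVT::v2i32, and anything else falls back to the generic choice
// (MVT::Other). The final chunking, including any tail, is decided by the
// generic memop lowering.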
2003 
2004 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
2005   const MemSDNode *MemNode = cast<MemSDNode>(N);
2006   return MemNode->getMemOperand()->getFlags() & MONoClobber;
2007 }
2008 
2009 bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
2010   return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
2011          AS == AMDGPUAS::PRIVATE_ADDRESS;
2012 }
2013 
2014 bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
2015                                            unsigned DestAS) const {
2016   // Flat -> private/local is a simple truncate.
2017   // Flat -> global is no-op
2018   if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2019     return true;
2020 
2021   const GCNTargetMachine &TM =
2022       static_cast<const GCNTargetMachine &>(getTargetMachine());
2023   return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2024 }
2025 
2026 TargetLoweringBase::LegalizeTypeAction
2027 SITargetLowering::getPreferredVectorAction(MVT VT) const {
2028   if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2029       VT.getScalarType().bitsLE(MVT::i16))
2030     return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
2031   return TargetLoweringBase::getPreferredVectorAction(VT);
2032 }
2033 
2034 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
2035                                                          Type *Ty) const {
2036   // FIXME: Could be smarter if called for vector constants.
2037   return true;
2038 }
2039 
2040 bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
2041                                                unsigned Index) const {
2042   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
2043     return false;
2044 
2045   // TODO: Add more cases that are cheap.
2046   return Index == 0;
2047 }
2048 
2049 bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2050   // TODO: This should be more aggressive, particular for 16-bit element
2051   // vectors. However there are some mixed improvements and regressions.
2052   EVT EltTy = VT.getVectorElementType();
2053   return EltTy.getSizeInBits() % 32 == 0;
2054 }
2055 
2056 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
2057   if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2058     switch (Op) {
2059     case ISD::LOAD:
2060     case ISD::STORE:
2061       return true;
2062     default:
2063       return false;
2064     }
2065   }
2066 
2067   // SimplifySetCC uses this function to determine whether or not it should
2068   // create setcc with i1 operands.  We don't have instructions for i1 setcc.
2069   if (VT == MVT::i1 && Op == ISD::SETCC)
2070     return false;
2071 
2072   return TargetLowering::isTypeDesirableForOp(Op, VT);
2073 }
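
// Example of the i16 policy above (illustrative): with 16-bit instructions
// available, an i16 load or store is kept as-is, but for an i16 'add' this
// hook returns false so that the DAG combiner may promote the operation to
// i32, where more combines apply.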
2074 
2075 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2076                                                    const SDLoc &SL,
2077                                                    SDValue Chain,
2078                                                    uint64_t Offset) const {
2079   const DataLayout &DL = DAG.getDataLayout();
2080   MachineFunction &MF = DAG.getMachineFunction();
2081   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2082   MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
2083 
2084   auto [InputPtrReg, RC, ArgTy] =
2085       Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2086 
2087   // We may not have the kernarg segment argument if we have no kernel
2088   // arguments.
2089   if (!InputPtrReg)
2090     return DAG.getConstant(Offset, SL, PtrVT);
2091 
2092   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
2093   SDValue BasePtr = DAG.getCopyFromReg(
2094       Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2095 
2096   return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2097 }
2098 
2099 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2100                                             const SDLoc &SL) const {
2101   uint64_t Offset =
2102       getImplicitParameterOffset(DAG.getMachineFunction(), FIRST_IMPLICIT);
2103   return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2104 }
2105 
2106 SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2107                                          const SDLoc &SL) const {
2108 
2109   Function &F = DAG.getMachineFunction().getFunction();
2110   std::optional<uint32_t> KnownSize =
2111       AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
2112   if (KnownSize.has_value())
2113     return DAG.getConstant(*KnownSize, SL, MVT::i32);
2114   return SDValue();
2115 }
2116 
2117 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2118                                          const SDLoc &SL, SDValue Val,
2119                                          bool Signed,
2120                                          const ISD::InputArg *Arg) const {
2121   // First, if it is a widened vector, narrow it.
2122   if (VT.isVector() &&
2123       VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
2124     EVT NarrowedVT =
2125         EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
2126                          VT.getVectorNumElements());
2127     Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2128                       DAG.getConstant(0, SL, MVT::i32));
2129   }
2130 
2131   // Then convert the vector elements or scalar value.
2132   if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2133     unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2134     Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2135   }
2136 
2137   if (MemVT.isFloatingPoint())
2138     Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2139   else if (Signed)
2140     Val = DAG.getSExtOrTrunc(Val, SL, VT);
2141   else
2142     Val = DAG.getZExtOrTrunc(Val, SL, VT);
2143 
2144   return Val;
2145 }
2146 
2147 SDValue SITargetLowering::lowerKernargMemParameter(
2148     SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2149     uint64_t Offset, Align Alignment, bool Signed,
2150     const ISD::InputArg *Arg) const {
2151   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2152 
2153   // Try to avoid using an extload by loading earlier than the argument address,
2154   // and extracting the relevant bits. The load should hopefully be merged with
2155   // the previous argument.
2156   if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2157     // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2158     int64_t AlignDownOffset = alignDown(Offset, 4);
2159     int64_t OffsetDiff = Offset - AlignDownOffset;
2160 
2161     EVT IntVT = MemVT.changeTypeToInteger();
2162 
2163     // TODO: If we passed in the base kernel offset we could have a better
2164     // alignment than 4, but we don't really need it.
2165     SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2166     SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2167                                MachineMemOperand::MODereferenceable |
2168                                    MachineMemOperand::MOInvariant);
2169 
2170     SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2171     SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2172 
2173     SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2174     ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2175     ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2176 
2177     return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2178   }
2179 
2180   SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2181   SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2182                              MachineMemOperand::MODereferenceable |
2183                                  MachineMemOperand::MOInvariant);
2184 
2185   SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2186   return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2187 }
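
// Worked example for the sub-dword path above (illustrative): an i16 kernarg
// at byte Offset = 6 with Align(2) loads the aligned i32 at offset 4, shifts
// it right by (6 - 4) * 8 = 16 bits, truncates to i16, and converts to the
// expected argument type, hopefully merging with the neighbouring argument's
// dword load.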
2188 
2189 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2190                                               CCValAssign &VA, const SDLoc &SL,
2191                                               SDValue Chain,
2192                                               const ISD::InputArg &Arg) const {
2193   MachineFunction &MF = DAG.getMachineFunction();
2194   MachineFrameInfo &MFI = MF.getFrameInfo();
2195 
2196   if (Arg.Flags.isByVal()) {
2197     unsigned Size = Arg.Flags.getByValSize();
2198     int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2199     return DAG.getFrameIndex(FrameIdx, MVT::i32);
2200   }
2201 
2202   unsigned ArgOffset = VA.getLocMemOffset();
2203   unsigned ArgSize = VA.getValVT().getStoreSize();
2204 
2205   int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2206 
2207   // Create load nodes to retrieve arguments from the stack.
2208   SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2209   SDValue ArgValue;
2210 
2211   // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2212   ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2213   MVT MemVT = VA.getValVT();
2214 
2215   switch (VA.getLocInfo()) {
2216   default:
2217     break;
2218   case CCValAssign::BCvt:
2219     MemVT = VA.getLocVT();
2220     break;
2221   case CCValAssign::SExt:
2222     ExtType = ISD::SEXTLOAD;
2223     break;
2224   case CCValAssign::ZExt:
2225     ExtType = ISD::ZEXTLOAD;
2226     break;
2227   case CCValAssign::AExt:
2228     ExtType = ISD::EXTLOAD;
2229     break;
2230   }
2231 
2232   ArgValue = DAG.getExtLoad(
2233       ExtType, SL, VA.getLocVT(), Chain, FIN,
2234       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT);
2235   return ArgValue;
2236 }
2237 
2238 SDValue SITargetLowering::getPreloadedValue(
2239     SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2240     AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
2241   const ArgDescriptor *Reg = nullptr;
2242   const TargetRegisterClass *RC;
2243   LLT Ty;
2244 
2245   CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
2246   const ArgDescriptor WorkGroupIDX =
2247       ArgDescriptor::createRegister(AMDGPU::TTMP9);
2248   // If GridZ is not programmed in an entry function then the hardware will set
2249   // it to all zeros, so there is no need to mask the GridY value in the low
2250   // order bits.
2251   const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2252       AMDGPU::TTMP7,
2253       AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2254   const ArgDescriptor WorkGroupIDZ =
2255       ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2256   if (Subtarget->hasArchitectedSGPRs() &&
2257       (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
2258     switch (PVID) {
2259     case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
2260       Reg = &WorkGroupIDX;
2261       RC = &AMDGPU::SReg_32RegClass;
2262       Ty = LLT::scalar(32);
2263       break;
2264     case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
2265       Reg = &WorkGroupIDY;
2266       RC = &AMDGPU::SReg_32RegClass;
2267       Ty = LLT::scalar(32);
2268       break;
2269     case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
2270       Reg = &WorkGroupIDZ;
2271       RC = &AMDGPU::SReg_32RegClass;
2272       Ty = LLT::scalar(32);
2273       break;
2274     default:
2275       break;
2276     }
2277   }
2278 
2279   if (!Reg)
2280     std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2281   if (!Reg) {
2282     if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
2283       // It's possible for a kernarg intrinsic call to appear in a kernel with
2284       // no allocated segment, in which case we do not add the user sgpr
2285       // argument, so just return null.
2286       return DAG.getConstant(0, SDLoc(), VT);
2287     }
2288 
2289     // It's undefined behavior if a function marked with the amdgpu-no-*
2290     // attributes uses the corresponding intrinsic.
2291     return DAG.getPOISON(VT);
2292   }
2293 
2294   return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2295 }
2296 
2297 static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
2298                                CallingConv::ID CallConv,
2299                                ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2300                                FunctionType *FType,
2301                                SIMachineFunctionInfo *Info) {
2302   for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2303     const ISD::InputArg *Arg = &Ins[I];
2304 
2305     assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2306            "vector type argument should have been split");
2307 
2308     // First check if it's a PS input addr.
2309     if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2310         PSInputNum <= 15) {
2311       bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2312 
2313       // Inconveniently only the first part of the split is marked as isSplit,
2314       // so skip to the end. We only want to increment PSInputNum once for the
2315       // entire split argument.
2316       if (Arg->Flags.isSplit()) {
2317         while (!Arg->Flags.isSplitEnd()) {
2318           assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2319                  "unexpected vector split in ps argument type");
2320           if (!SkipArg)
2321             Splits.push_back(*Arg);
2322           Arg = &Ins[++I];
2323         }
2324       }
2325 
2326       if (SkipArg) {
2327         // We can safely skip PS inputs.
2328         Skipped.set(Arg->getOrigArgIndex());
2329         ++PSInputNum;
2330         continue;
2331       }
2332 
2333       Info->markPSInputAllocated(PSInputNum);
2334       if (Arg->Used)
2335         Info->markPSInputEnabled(PSInputNum);
2336 
2337       ++PSInputNum;
2338     }
2339 
2340     Splits.push_back(*Arg);
2341   }
2342 }
2343 
2344 // Allocate special inputs passed in VGPRs.
2345 void SITargetLowering::allocateSpecialEntryInputVGPRs(
2346     CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2347     SIMachineFunctionInfo &Info) const {
2348   const LLT S32 = LLT::scalar(32);
2349   MachineRegisterInfo &MRI = MF.getRegInfo();
2350 
2351   if (Info.hasWorkItemIDX()) {
2352     Register Reg = AMDGPU::VGPR0;
2353     MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2354 
2355     CCInfo.AllocateReg(Reg);
2356     unsigned Mask =
2357         (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2358     Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2359   }
2360 
2361   if (Info.hasWorkItemIDY()) {
2362     assert(Info.hasWorkItemIDX());
2363     if (Subtarget->hasPackedTID()) {
2364       Info.setWorkItemIDY(
2365           ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2366     } else {
2367       unsigned Reg = AMDGPU::VGPR1;
2368       MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2369 
2370       CCInfo.AllocateReg(Reg);
2371       Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2372     }
2373   }
2374 
2375   if (Info.hasWorkItemIDZ()) {
2376     assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2377     if (Subtarget->hasPackedTID()) {
2378       Info.setWorkItemIDZ(
2379           ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2380     } else {
2381       unsigned Reg = AMDGPU::VGPR2;
2382       MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2383 
2384       CCInfo.AllocateReg(Reg);
2385       Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2386     }
2387   }
2388 }
2389 
2390 // Try to allocate a VGPR at the end of the argument list, or, if no argument
2391 // VGPRs are left, allocate a stack slot instead.
2392 // If \p Mask is given, it indicates the bitfield position in the register.
2393 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2394 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2395                                          ArgDescriptor Arg = ArgDescriptor()) {
2396   if (Arg.isSet())
2397     return ArgDescriptor::createArg(Arg, Mask);
2398 
2399   ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2400   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2401   if (RegIdx == ArgVGPRs.size()) {
2402     // Spill to stack required.
2403     int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2404 
2405     return ArgDescriptor::createStack(Offset, Mask);
2406   }
2407 
2408   unsigned Reg = ArgVGPRs[RegIdx];
2409   Reg = CCInfo.AllocateReg(Reg);
2410   assert(Reg != AMDGPU::NoRegister);
2411 
2412   MachineFunction &MF = CCInfo.getMachineFunction();
2413   Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2414   MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2415   return ArgDescriptor::createRegister(Reg, Mask);
2416 }
2417 
2418 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2419                                              const TargetRegisterClass *RC,
2420                                              unsigned NumArgRegs) {
2421   ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2422   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2423   if (RegIdx == ArgSGPRs.size())
2424     report_fatal_error("ran out of SGPRs for arguments");
2425 
2426   unsigned Reg = ArgSGPRs[RegIdx];
2427   Reg = CCInfo.AllocateReg(Reg);
2428   assert(Reg != AMDGPU::NoRegister);
2429 
2430   MachineFunction &MF = CCInfo.getMachineFunction();
2431   MF.addLiveIn(Reg, RC);
2432   return ArgDescriptor::createRegister(Reg);
2433 }
2434 
2435 // If this has a fixed position, we still should allocate the register in the
2436 // CCInfo state. Technically we could get away without this for values passed
2437 // outside of the normal argument range.
2438 static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2439                                        const TargetRegisterClass *RC,
2440                                        MCRegister Reg) {
2441   Reg = CCInfo.AllocateReg(Reg);
2442   assert(Reg != AMDGPU::NoRegister);
2443   MachineFunction &MF = CCInfo.getMachineFunction();
2444   MF.addLiveIn(Reg, RC);
2445 }
2446 
2447 static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2448   if (Arg) {
2449     allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2450                                Arg.getRegister());
2451   } else
2452     Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2453 }
2454 
2455 static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2456   if (Arg) {
2457     allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2458                                Arg.getRegister());
2459   } else
2460     Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2461 }
2462 
2463 /// Allocate implicit function VGPR arguments at the end of allocated user
2464 /// arguments.
2465 void SITargetLowering::allocateSpecialInputVGPRs(
2466     CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2467     SIMachineFunctionInfo &Info) const {
2468   const unsigned Mask = 0x3ff;
2469   ArgDescriptor Arg;
2470 
2471   if (Info.hasWorkItemIDX()) {
2472     Arg = allocateVGPR32Input(CCInfo, Mask);
2473     Info.setWorkItemIDX(Arg);
2474   }
2475 
2476   if (Info.hasWorkItemIDY()) {
2477     Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2478     Info.setWorkItemIDY(Arg);
2479   }
2480 
2481   if (Info.hasWorkItemIDZ())
2482     Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2483 }
2484 
2485 /// Allocate implicit function VGPR arguments in fixed registers.
2486 void SITargetLowering::allocateSpecialInputVGPRsFixed(
2487     CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2488     SIMachineFunctionInfo &Info) const {
2489   Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2490   if (!Reg)
2491     report_fatal_error("failed to allocate VGPR for implicit arguments");
2492 
2493   const unsigned Mask = 0x3ff;
2494   Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2495   Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2496   Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2497 }
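
// Resulting packed layout (for illustration): all three workitem IDs share
// VGPR31, with X in bits [9:0], Y in bits [19:10], and Z in bits [29:20],
// matching the masks set above.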
2498 
2499 void SITargetLowering::allocateSpecialInputSGPRs(
2500     CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2501     SIMachineFunctionInfo &Info) const {
2502   auto &ArgInfo = Info.getArgInfo();
2503   const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2504 
2505   // TODO: Unify handling with private memory pointers.
2506   if (UserSGPRInfo.hasDispatchPtr())
2507     allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2508 
2509   if (UserSGPRInfo.hasQueuePtr())
2510     allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2511 
2512   // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2513   // constant offset from the kernarg segment.
2514   if (Info.hasImplicitArgPtr())
2515     allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2516 
2517   if (UserSGPRInfo.hasDispatchID())
2518     allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2519 
2520   // flat_scratch_init is not applicable for non-kernel functions.
2521 
2522   if (Info.hasWorkGroupIDX())
2523     allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2524 
2525   if (Info.hasWorkGroupIDY())
2526     allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2527 
2528   if (Info.hasWorkGroupIDZ())
2529     allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2530 
2531   if (Info.hasLDSKernelId())
2532     allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2533 }
2534 
2535 // Allocate special inputs passed in user SGPRs.
2536 void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2537                                             MachineFunction &MF,
2538                                             const SIRegisterInfo &TRI,
2539                                             SIMachineFunctionInfo &Info) const {
2540   const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2541   if (UserSGPRInfo.hasImplicitBufferPtr()) {
2542     Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2543     MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2544     CCInfo.AllocateReg(ImplicitBufferPtrReg);
2545   }
2546 
2547   // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2548   if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2549     Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2550     MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2551     CCInfo.AllocateReg(PrivateSegmentBufferReg);
2552   }
2553 
2554   if (UserSGPRInfo.hasDispatchPtr()) {
2555     Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2556     MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2557     CCInfo.AllocateReg(DispatchPtrReg);
2558   }
2559 
2560   if (UserSGPRInfo.hasQueuePtr()) {
2561     Register QueuePtrReg = Info.addQueuePtr(TRI);
2562     MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2563     CCInfo.AllocateReg(QueuePtrReg);
2564   }
2565 
2566   if (UserSGPRInfo.hasKernargSegmentPtr()) {
2567     MachineRegisterInfo &MRI = MF.getRegInfo();
2568     Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2569     CCInfo.AllocateReg(InputPtrReg);
2570 
2571     Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2572     MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2573   }
2574 
2575   if (UserSGPRInfo.hasDispatchID()) {
2576     Register DispatchIDReg = Info.addDispatchID(TRI);
2577     MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2578     CCInfo.AllocateReg(DispatchIDReg);
2579   }
2580 
2581   if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2582     Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2583     MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2584     CCInfo.AllocateReg(FlatScratchInitReg);
2585   }
2586 
2587   if (UserSGPRInfo.hasPrivateSegmentSize()) {
2588     Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2589     MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2590     CCInfo.AllocateReg(PrivateSegmentSizeReg);
2591   }
2592 
2593   // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2594   // these from the dispatch pointer.
2595 }
2596 
2597 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2598 // sequential, starting from the first argument.
2599 void SITargetLowering::allocatePreloadKernArgSGPRs(
2600     CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2601     const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
2602     const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2603   Function &F = MF.getFunction();
2604   unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2605   GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2606   bool InPreloadSequence = true;
2607   unsigned InIdx = 0;
2608   bool AlignedForImplicitArgs = false;
2609   unsigned ImplicitArgOffset = 0;
2610   for (auto &Arg : F.args()) {
2611     if (!InPreloadSequence || !Arg.hasInRegAttr())
2612       break;
2613 
2614     unsigned ArgIdx = Arg.getArgNo();
2615     // Don't preload non-original args or parts not in the current preload
2616     // sequence.
2617     if (InIdx < Ins.size() &&
2618         (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2619       break;
2620 
2621     for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2622            Ins[InIdx].getOrigArgIndex() == ArgIdx;
2623          InIdx++) {
2624       assert(ArgLocs[ArgIdx].isMemLoc());
2625       auto &ArgLoc = ArgLocs[InIdx];
2626       const Align KernelArgBaseAlign = Align(16);
2627       unsigned ArgOffset = ArgLoc.getLocMemOffset();
2628       Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2629       unsigned NumAllocSGPRs =
2630           alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2631 
2632       // Fix alignment for hidden arguments.
2633       if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2634         if (!AlignedForImplicitArgs) {
2635           ImplicitArgOffset =
2636               alignTo(LastExplicitArgOffset,
2637                       Subtarget->getAlignmentForImplicitArgPtr()) -
2638               LastExplicitArgOffset;
2639           AlignedForImplicitArgs = true;
2640         }
2641         ArgOffset += ImplicitArgOffset;
2642       }
2643 
2644       // Arg is preloaded into the previous SGPR.
2645       if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2646         assert(InIdx >= 1 && "No previous SGPR");
2647         Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2648             Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2649         continue;
2650       }
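           // Padding is the gap between the end of the previous argument and
           // this argument's offset; it is covered by otherwise unused user
           // SGPRs.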
2651 
2652       unsigned Padding = ArgOffset - LastExplicitArgOffset;
2653       unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2654       // Check for free user SGPRs for preloading.
2655       if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2656         InPreloadSequence = false;
2657         break;
2658       }
2659 
2660       // Preload this argument.
2661       const TargetRegisterClass *RC =
2662           TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2663       SmallVectorImpl<MCRegister> *PreloadRegs =
2664           Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2665 
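           // If the argument spans multiple SGPRs, record each one as an
           // individual 32-bit live-in.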
2666       if (PreloadRegs->size() > 1)
2667         RC = &AMDGPU::SGPR_32RegClass;
2668       for (auto &Reg : *PreloadRegs) {
2669         assert(Reg);
2670         MF.addLiveIn(Reg, RC);
2671         CCInfo.AllocateReg(Reg);
2672       }
2673 
2674       LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2675     }
2676   }
2677 }
2678 
2679 void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
2680                                            const SIRegisterInfo &TRI,
2681                                            SIMachineFunctionInfo &Info) const {
2682   // Always allocate this last since it is a synthetic preload.
2683   if (Info.hasLDSKernelId()) {
2684     Register Reg = Info.addLDSKernelId();
2685     MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2686     CCInfo.AllocateReg(Reg);
2687   }
2688 }
2689 
2690 // Allocate special input registers that are initialized per-wave.
2691 void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
2692                                            SIMachineFunctionInfo &Info,
2693                                            CallingConv::ID CallConv,
2694                                            bool IsShader) const {
2695   bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2696   if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2697     // Note: user SGPRs are handled by the front-end for graphics shaders.
2698     // Pad up the used user SGPRs with dead inputs.
2699 
2700     // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2701     // before enabling architected SGPRs for workgroup IDs.
2702     assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2703 
2704     unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2705     // Note that we do not count the PrivateSegmentWaveByteOffset. We do not
2706     // want to rely on it to reach 16, since if we end up having no stack
2707     // usage, it will not really be added.
2708     unsigned NumRequiredSystemSGPRs =
2709         Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2710         Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2711     for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2712       Register Reg = Info.addReservedUserSGPR();
2713       MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2714       CCInfo.AllocateReg(Reg);
2715     }
2716   }
2717 
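       // With architected SGPRs, the workgroup IDs are available in dedicated
       // registers, so no system SGPRs are allocated for them here.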
2718   if (!HasArchitectedSGPRs) {
2719     if (Info.hasWorkGroupIDX()) {
2720       Register Reg = Info.addWorkGroupIDX();
2721       MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2722       CCInfo.AllocateReg(Reg);
2723     }
2724 
2725     if (Info.hasWorkGroupIDY()) {
2726       Register Reg = Info.addWorkGroupIDY();
2727       MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2728       CCInfo.AllocateReg(Reg);
2729     }
2730 
2731     if (Info.hasWorkGroupIDZ()) {
2732       Register Reg = Info.addWorkGroupIDZ();
2733       MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2734       CCInfo.AllocateReg(Reg);
2735     }
2736   }
2737 
2738   if (Info.hasWorkGroupInfo()) {
2739     Register Reg = Info.addWorkGroupInfo();
2740     MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2741     CCInfo.AllocateReg(Reg);
2742   }
2743 
2744   if (Info.hasPrivateSegmentWaveByteOffset()) {
2745     // Scratch wave offset passed in system SGPR.
2746     unsigned PrivateSegmentWaveByteOffsetReg;
2747 
2748     if (IsShader) {
2749       PrivateSegmentWaveByteOffsetReg =
2750           Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2751 
2752       // This is true if the scratch wave byte offset doesn't have a fixed
2753       // location.
2754       if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2755         PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2756         Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2757       }
2758     } else
2759       PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2760 
2761     MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2762     CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2763   }
2764 
2765   assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2766          Info.getNumPreloadedSGPRs() >= 16);
2767 }
2768 
2769 static void reservePrivateMemoryRegs(const TargetMachine &TM,
2770                                      MachineFunction &MF,
2771                                      const SIRegisterInfo &TRI,
2772                                      SIMachineFunctionInfo &Info) {
2773   // Now that we've figured out where the scratch register inputs are, see if
2774   // we should reserve the arguments and use them directly.
2775   MachineFrameInfo &MFI = MF.getFrameInfo();
2776   bool HasStackObjects = MFI.hasStackObjects();
2777   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2778 
2779   // Record that we know we have non-spill stack objects so we don't need to
2780   // check all stack objects later.
2781   if (HasStackObjects)
2782     Info.setHasNonSpillStackObjects(true);
2783 
2784   // Everything live out of a block is spilled with fast regalloc, so it's
2785   // almost certain that spilling will be required.
2786   if (TM.getOptLevel() == CodeGenOptLevel::None)
2787     HasStackObjects = true;
2788 
2789   // For now assume stack access is needed in any callee functions, so we need
2790   // the scratch registers to pass in.
2791   bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2792 
2793   if (!ST.enableFlatScratch()) {
2794     if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2795       // If we have stack objects, we unquestionably need the private buffer
2796       // resource. For the Code Object V2 ABI, this will be the first 4 user
2797       // SGPR inputs. We can reserve those and use them directly.
2798 
2799       Register PrivateSegmentBufferReg =
2800           Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
2801       Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2802     } else {
2803       unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2804       // We tentatively reserve the highest available registers (skipping those
2805       // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2806       // we'll replace these with the registers immediately after those which
2807       // were really allocated. In the prologue, copies will be inserted from
2808       // the argument to these reserved registers.
2809 
2810       // Without HSA, relocations are used for the scratch pointer and the
2811       // buffer resource setup is always inserted in the prologue. Scratch wave
2812       // offset is still in an input SGPR.
2813       Info.setScratchRSrcReg(ReservedBufferReg);
2814     }
2815   }
2816 
2817   MachineRegisterInfo &MRI = MF.getRegInfo();
2818 
2819   // For entry functions we have to set up the stack pointer if we use it,
2820   // whereas non-entry functions get this "for free". This means there is no
2821   // intrinsic advantage to using S32 over S34 in cases where we do not have
2822   // calls but do need a frame pointer (i.e. if we are requested to have one
2823   // because frame pointer elimination is disabled). To keep things simple we
2824   // only ever use S32 as the call ABI stack pointer, and so using it does not
2825   // imply we need a separate frame pointer.
2826   //
2827   // Try to use s32 as the SP, but move it if it would interfere with input
2828   // arguments. This won't work with calls though.
2829   //
2830   // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2831   // registers.
2832   if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2833     Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2834   } else {
2835     assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
2836 
2837     if (MFI.hasCalls())
2838       report_fatal_error("call in graphics shader with too many input SGPRs");
2839 
2840     for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2841       if (!MRI.isLiveIn(Reg)) {
2842         Info.setStackPtrOffsetReg(Reg);
2843         break;
2844       }
2845     }
2846 
2847     if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2848       report_fatal_error("failed to find register for SP");
2849   }
2850 
2851   // hasFP should be accurate for entry functions even before the frame is
2852   // finalized, because it does not rely on the known stack size, only
2853   // properties like whether variable sized objects are present.
2854   if (ST.getFrameLowering()->hasFP(MF)) {
2855     Info.setFrameOffsetReg(AMDGPU::SGPR33);
2856   }
2857 }
2858 
2859 bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
2860   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2861   return !Info->isEntryFunction();
2862 }
2863 
2864 void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {}
2865 
2866 void SITargetLowering::insertCopiesSplitCSR(
2867     MachineBasicBlock *Entry,
2868     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2869   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2870 
2871   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2872   if (!IStart)
2873     return;
2874 
2875   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2876   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2877   MachineBasicBlock::iterator MBBI = Entry->begin();
2878   for (const MCPhysReg *I = IStart; *I; ++I) {
2879     const TargetRegisterClass *RC = nullptr;
2880     if (AMDGPU::SReg_64RegClass.contains(*I))
2881       RC = &AMDGPU::SGPR_64RegClass;
2882     else if (AMDGPU::SReg_32RegClass.contains(*I))
2883       RC = &AMDGPU::SGPR_32RegClass;
2884     else
2885       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2886 
2887     Register NewVR = MRI->createVirtualRegister(RC);
2888     // Create copy from CSR to a virtual register.
2889     Entry->addLiveIn(*I);
2890     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2891         .addReg(*I);
2892 
2893     // Insert the copy-back instructions right before the terminator.
2894     for (auto *Exit : Exits)
2895       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2896               TII->get(TargetOpcode::COPY), *I)
2897           .addReg(NewVR);
2898   }
2899 }
2900 
2901 SDValue SITargetLowering::LowerFormalArguments(
2902     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2903     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2904     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2905   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2906 
2907   MachineFunction &MF = DAG.getMachineFunction();
2908   const Function &Fn = MF.getFunction();
2909   FunctionType *FType = MF.getFunction().getFunctionType();
2910   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2911   bool IsError = false;
2912 
2913   if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2914     DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2915         Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
2916     IsError = true;
2917   }
2918 
2919   SmallVector<ISD::InputArg, 16> Splits;
2920   SmallVector<CCValAssign, 16> ArgLocs;
2921   BitVector Skipped(Ins.size());
2922   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2923                  *DAG.getContext());
2924 
2925   bool IsGraphics = AMDGPU::isGraphics(CallConv);
2926   bool IsKernel = AMDGPU::isKernel(CallConv);
2927   bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2928 
2929   if (IsGraphics) {
2930     const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2931     assert(!UserSGPRInfo.hasDispatchPtr() &&
2932            !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2933            !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2934            !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2935     (void)UserSGPRInfo;
2936     if (!Subtarget->enableFlatScratch())
2937       assert(!UserSGPRInfo.hasFlatScratchInit());
2938     if ((CallConv != CallingConv::AMDGPU_CS &&
2939          CallConv != CallingConv::AMDGPU_Gfx) ||
2940         !Subtarget->hasArchitectedSGPRs())
2941       assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2942              !Info->hasWorkGroupIDZ());
2943   }
2944 
2945   if (CallConv == CallingConv::AMDGPU_PS) {
2946     processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2947 
2948     // At least one interpolation mode must be enabled or else the GPU will
2949     // hang.
2950     //
2951     // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2952     // set PSInputAddr, the user wants to enable some bits after the compilation
2953     // based on run-time states. Since we can't know what the final PSInputEna
2954     // will look like, we shouldn't do anything here and the user should take
2955     // responsibility for the correct programming.
2956     //
2957     // Otherwise, the following restrictions apply:
2958     // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2959     // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2960     //   enabled too.
2961     if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2962         ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2963       CCInfo.AllocateReg(AMDGPU::VGPR0);
2964       CCInfo.AllocateReg(AMDGPU::VGPR1);
2965       Info->markPSInputAllocated(0);
2966       Info->markPSInputEnabled(0);
2967     }
2968     if (Subtarget->isAmdPalOS()) {
2969       // For isAmdPalOS, the user does not enable some bits after compilation
2970       // based on run-time states; the register values being generated here are
2971       // the final ones set in hardware. Therefore we need to apply the
2972       // workaround to PSInputAddr and PSInputEnable together. (The case where
2973       // a bit is set in PSInputAddr but not PSInputEnable is where the
2974       // frontend set up an input arg for a particular interpolation mode, but
2975       // nothing uses that input arg. Really we should have an earlier pass
2976       // that removes such an arg.)
2977       unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2978       if ((PsInputBits & 0x7F) == 0 ||
2979           ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2980         Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2981     }
2982   } else if (IsKernel) {
2983     assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2984   } else {
2985     Splits.append(Ins.begin(), Ins.end());
2986   }
2987 
2988   if (IsKernel)
2989     analyzeFormalArgumentsCompute(CCInfo, Ins);
2990 
2991   if (IsEntryFunc) {
2992     allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2993     allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2994     if (IsKernel && Subtarget->hasKernargPreload())
2995       allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2996 
2997     allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2998   } else if (!IsGraphics) {
2999     // For the fixed ABI, pass workitem IDs in the last argument register.
3000     allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3001 
3002     // FIXME: Sink this into allocateSpecialInputSGPRs
3003     if (!Subtarget->enableFlatScratch())
3004       CCInfo.AllocateReg(Info->getScratchRSrcReg());
3005 
3006     allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3007   }
3008 
3009   if (!IsKernel) {
3010     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3011     CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3012   }
3013 
3014   SmallVector<SDValue, 16> Chains;
3015 
3016   // FIXME: This is the minimum kernel argument alignment. We should improve
3017   // this to the maximum alignment of the arguments.
3018   //
3019   // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3020   // kern arg offset.
3021   const Align KernelArgBaseAlign = Align(16);
3022 
3023   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
3024     const ISD::InputArg &Arg = Ins[i];
3025     if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3026       InVals.push_back(DAG.getPOISON(Arg.VT));
3027       continue;
3028     }
3029 
3030     CCValAssign &VA = ArgLocs[ArgIdx++];
3031     MVT VT = VA.getLocVT();
3032 
3033     if (IsEntryFunc && VA.isMemLoc()) {
3034       VT = Ins[i].VT;
3035       EVT MemVT = VA.getLocVT();
3036 
3037       const uint64_t Offset = VA.getLocMemOffset();
3038       Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3039 
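           // A byref kernel argument is lowered to a pointer into the kernarg
           // segment rather than a loaded value; an addrspacecast is inserted
           // when the IR-declared pointer address space differs.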
3040       if (Arg.Flags.isByRef()) {
3041         SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3042 
3043         const GCNTargetMachine &TM =
3044             static_cast<const GCNTargetMachine &>(getTargetMachine());
3045         if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3046                                     Arg.Flags.getPointerAddrSpace())) {
3047           Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
3048                                      Arg.Flags.getPointerAddrSpace());
3049         }
3050 
3051         InVals.push_back(Ptr);
3052         continue;
3053       }
3054 
3055       SDValue NewArg;
3056       if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3057         if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3058           // In this case the argument is packed into the previous preload SGPR.
3059           int64_t AlignDownOffset = alignDown(Offset, 4);
3060           int64_t OffsetDiff = Offset - AlignDownOffset;
3061           EVT IntVT = MemVT.changeTypeToInteger();
3062 
3063           const SIMachineFunctionInfo *Info =
3064               MF.getInfo<SIMachineFunctionInfo>();
3065           MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
3066           Register Reg =
3067               Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3068 
3069           assert(Reg);
3070           Register VReg = MRI.getLiveInVirtReg(Reg);
3071           SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3072 
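               // Recover the sub-dword value: shift the containing dword right
               // by the offset in bits, then truncate to the memory type.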
3073           SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3074           SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3075 
3076           SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3077           ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3078           NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3079                                   Ins[i].Flags.isSExt(), &Ins[i]);
3080 
3081           NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3082         } else {
3083           const SIMachineFunctionInfo *Info =
3084               MF.getInfo<SIMachineFunctionInfo>();
3085           MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
3086           const SmallVectorImpl<MCRegister> &PreloadRegs =
3087               Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3088 
3089           SDValue Copy;
3090           if (PreloadRegs.size() == 1) {
3091             Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3092             const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3093             NewArg = DAG.getCopyFromReg(
3094                 Chain, DL, VReg,
3095                 EVT::getIntegerVT(*DAG.getContext(),
3096                                   TRI->getRegSizeInBits(*RC)));
3097 
3098           } else {
3099             // If the kernarg alignment does not match the alignment of the SGPR
3100             // tuple RC that can accommodate this argument, it will be built up
3101             // via copies from the individual SGPRs that the argument was
3102             // preloaded to.
3103             SmallVector<SDValue, 4> Elts;
3104             for (auto Reg : PreloadRegs) {
3105               Register VReg = MRI.getLiveInVirtReg(Reg);
3106               Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3107               Elts.push_back(Copy);
3108             }
3109             NewArg =
3110                 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3111                                                     PreloadRegs.size()),
3112                                    DL, Elts);
3113           }
3114 
3115           // If the argument was preloaded to multiple consecutive 32-bit
3116           // registers because of misalignment between addressable SGPR tuples
3117           // and the argument size, we can still assume, because of kernarg
3118           // segment alignment restrictions, that NewArg's size is the same as
3119           // MemVT's and just do a bitcast. If MemVT is less than 32 bits, we
3120           // add a truncate, since we cannot preload to less than a single SGPR
3121           // and MemVT may be smaller.
3122           EVT MemVTInt =
3123               EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
3124           if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3125             NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3126 
3127           NewArg = DAG.getBitcast(MemVT, NewArg);
3128           NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3129                                   Ins[i].Flags.isSExt(), &Ins[i]);
3130           NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3131         }
3132       } else {
3133         // Hidden arguments that are in the kernel signature must be preloaded
3134         // to user SGPRs. Print a diagnostic error if a hidden argument is in
3135         // the argument list and is not preloaded.
3136         if (Arg.isOrigArg()) {
3137           Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3138           if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3139             DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
3140                 *OrigArg->getParent(),
3141                 "hidden argument in kernel signature was not preloaded",
3142                 DL.getDebugLoc()));
3143           }
3144         }
3145 
3146         NewArg =
3147             lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3148                                      Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3149       }
3150       Chains.push_back(NewArg.getValue(1));
3151 
3152       auto *ParamTy =
3153           dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3154       if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3155           ParamTy &&
3156           (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3157            ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3158         // On SI local pointers are just offsets into LDS, so they are always
3159         // less than 16 bits. On CI and newer they could potentially be
3160         // real pointers, so we can't guarantee their size.
3161         NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3162                              DAG.getValueType(MVT::i16));
3163       }
3164 
3165       InVals.push_back(NewArg);
3166       continue;
3167     }
3168     if (!IsEntryFunc && VA.isMemLoc()) {
3169       SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3170       InVals.push_back(Val);
3171       if (!Arg.Flags.isByVal())
3172         Chains.push_back(Val.getValue(1));
3173       continue;
3174     }
3175 
3176     assert(VA.isRegLoc() && "Parameter must be in a register!");
3177 
3178     Register Reg = VA.getLocReg();
3179     const TargetRegisterClass *RC = nullptr;
3180     if (AMDGPU::VGPR_32RegClass.contains(Reg))
3181       RC = &AMDGPU::VGPR_32RegClass;
3182     else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3183       RC = &AMDGPU::SGPR_32RegClass;
3184     else
3185       llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3186     EVT ValVT = VA.getValVT();
3187 
3188     Reg = MF.addLiveIn(Reg, RC);
3189     SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3190 
3191     if (Arg.Flags.isSRet()) {
3192       // The return object should be reasonably addressable.
3193 
3194       // FIXME: This helps when the return is a real sret. If it is an
3195       // automatically inserted sret (i.e. CanLowerReturn returns false), an
3196       // extra copy is inserted in SelectionDAGBuilder which obscures this.
3197       unsigned NumBits =
3198           32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3199       Val = DAG.getNode(
3200           ISD::AssertZext, DL, VT, Val,
3201           DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3202     }
3203 
3204     // If this is an 8 or 16-bit value, it is really passed promoted
3205     // to 32 bits. Insert an assert[sz]ext to capture this, then
3206     // truncate to the right size.
3207     switch (VA.getLocInfo()) {
3208     case CCValAssign::Full:
3209       break;
3210     case CCValAssign::BCvt:
3211       Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3212       break;
3213     case CCValAssign::SExt:
3214       Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
3215       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3216       break;
3217     case CCValAssign::ZExt:
3218       Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
3219       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3220       break;
3221     case CCValAssign::AExt:
3222       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3223       break;
3224     default:
3225       llvm_unreachable("Unknown loc info!");
3226     }
3227 
3228     InVals.push_back(Val);
3229   }
3230 
3231   // Start adding system SGPRs.
3232   if (IsEntryFunc)
3233     allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3234 
3235   // DAG.getPass() returns nullptr when using the new pass manager.
3236   // TODO: Use DAG.getMFAM() to access analysis result.
3237   if (DAG.getPass()) {
3238     auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3239     ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3240   }
3241 
3242   unsigned StackArgSize = CCInfo.getStackSize();
3243   Info->setBytesInStackArgArea(StackArgSize);
3244 
3245   return Chains.empty() ? Chain
3246                         : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3247 }
3248 
3249 // TODO: If return values can't fit in registers, we should return as many as
3250 // possible in registers before passing on stack.
3251 bool SITargetLowering::CanLowerReturn(
3252     CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3253     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3254     const Type *RetTy) const {
3255   // Replacing returns with sret/stack usage doesn't make sense for shaders.
3256   // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3257   // for shaders. Vector types should be explicitly handled by CC.
3258   if (AMDGPU::isEntryFunctionCC(CallConv))
3259     return true;
3260 
3261   SmallVector<CCValAssign, 16> RVLocs;
3262   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3263   if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3264     return false;
3265 
3266   // We must use the stack if return would require unavailable registers.
3267   unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3268   unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
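       // VGPRs at or above this function's budget cannot hold return values;
       // if the calling convention assigned any of them, fall back to sret.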
3269   for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3270     if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3271       return false;
3272 
3273   return true;
3274 }
3275 
3276 SDValue
3277 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3278                               bool isVarArg,
3279                               const SmallVectorImpl<ISD::OutputArg> &Outs,
3280                               const SmallVectorImpl<SDValue> &OutVals,
3281                               const SDLoc &DL, SelectionDAG &DAG) const {
3282   MachineFunction &MF = DAG.getMachineFunction();
3283   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3284   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3285 
3286   if (AMDGPU::isKernel(CallConv)) {
3287     return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3288                                              OutVals, DL, DAG);
3289   }
3290 
3291   bool IsShader = AMDGPU::isShader(CallConv);
3292 
3293   Info->setIfReturnsVoid(Outs.empty());
3294   bool IsWaveEnd = Info->returnsVoid() && IsShader;
3295 
3296   // CCValAssign - represents the assignment of the return value to a location.
3297   SmallVector<CCValAssign, 48> RVLocs;
3298 
3299   // CCState - Info about the registers and stack slots.
3300   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3301                  *DAG.getContext());
3302 
3303   // Analyze outgoing return values.
3304   CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3305 
3306   SDValue Glue;
3307   SmallVector<SDValue, 48> RetOps;
3308   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3309 
3310   SDValue ReadFirstLane =
3311       DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3312   // Copy the result values into the output registers.
3313   for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3314        ++I, ++RealRVLocIdx) {
3315     CCValAssign &VA = RVLocs[I];
3316     assert(VA.isRegLoc() && "Can only return in registers!");
3317     // TODO: Partially return in registers if return values don't fit.
3318     SDValue Arg = OutVals[RealRVLocIdx];
3319 
3320     // Copied from other backends.
3321     switch (VA.getLocInfo()) {
3322     case CCValAssign::Full:
3323       break;
3324     case CCValAssign::BCvt:
3325       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3326       break;
3327     case CCValAssign::SExt:
3328       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3329       break;
3330     case CCValAssign::ZExt:
3331       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3332       break;
3333     case CCValAssign::AExt:
3334       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3335       break;
3336     default:
3337       llvm_unreachable("Unknown loc info!");
3338     }
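         // Values returned in SGPRs must be uniform; read the first lane so
         // the copy into a scalar register is well-defined even if the source
         // is divergent.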
3339     if (TRI->isSGPRPhysReg(VA.getLocReg()))
3340       Arg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Arg.getValueType(),
3341                         ReadFirstLane, Arg);
3342     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3343     Glue = Chain.getValue(1);
3344     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3345   }
3346 
3347   // FIXME: Does sret work properly?
3348   if (!Info->isEntryFunction()) {
3349     const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3350     const MCPhysReg *I =
3351         TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3352     if (I) {
3353       for (; *I; ++I) {
3354         if (AMDGPU::SReg_64RegClass.contains(*I))
3355           RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3356         else if (AMDGPU::SReg_32RegClass.contains(*I))
3357           RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3358         else
3359           llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3360       }
3361     }
3362   }
3363 
3364   // Update chain and glue.
3365   RetOps[0] = Chain;
3366   if (Glue.getNode())
3367     RetOps.push_back(Glue);
3368 
3369   unsigned Opc = AMDGPUISD::ENDPGM;
3370   if (!IsWaveEnd)
3371     Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE;
3372   return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3373 }
3374 
3375 SDValue SITargetLowering::LowerCallResult(
3376     SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3377     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3378     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3379     SDValue ThisVal) const {
3380   CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3381 
3382   // Assign locations to each value returned by this call.
3383   SmallVector<CCValAssign, 16> RVLocs;
3384   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3385                  *DAG.getContext());
3386   CCInfo.AnalyzeCallResult(Ins, RetCC);
3387 
3388   // Copy all of the result registers out of their specified physreg.
3389   for (CCValAssign VA : RVLocs) {
3390     SDValue Val;
3391 
3392     if (VA.isRegLoc()) {
3393       Val =
3394           DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3395       Chain = Val.getValue(1);
3396       InGlue = Val.getValue(2);
3397     } else if (VA.isMemLoc()) {
3398       report_fatal_error("TODO: return values in memory");
3399     } else
3400       llvm_unreachable("unknown argument location type");
3401 
3402     switch (VA.getLocInfo()) {
3403     case CCValAssign::Full:
3404       break;
3405     case CCValAssign::BCvt:
3406       Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3407       break;
3408     case CCValAssign::ZExt:
3409       Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3410                         DAG.getValueType(VA.getValVT()));
3411       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3412       break;
3413     case CCValAssign::SExt:
3414       Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3415                         DAG.getValueType(VA.getValVT()));
3416       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3417       break;
3418     case CCValAssign::AExt:
3419       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3420       break;
3421     default:
3422       llvm_unreachable("Unknown loc info!");
3423     }
3424 
3425     InVals.push_back(Val);
3426   }
3427 
3428   return Chain;
3429 }
3430 
3431 // Add code to pass the special inputs required by the features in use,
3432 // separate from the explicit user arguments present in the IR.
3433 void SITargetLowering::passSpecialInputs(
3434     CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3435     SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3436     SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3437   // If we don't have a call site, this was a call inserted by
3438   // legalization. These can never use special inputs.
3439   if (!CLI.CB)
3440     return;
3441 
3442   SelectionDAG &DAG = CLI.DAG;
3443   const SDLoc &DL = CLI.DL;
3444   const Function &F = DAG.getMachineFunction().getFunction();
3445 
3446   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3447   const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3448 
3449   const AMDGPUFunctionArgInfo *CalleeArgInfo =
3450       &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
3451   if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3452     // DAG.getPass() returns nullptr when using the new pass manager.
3453     // TODO: Use DAG.getMFAM() to access analysis result.
3454     if (DAG.getPass()) {
3455       auto &ArgUsageInfo =
3456           DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3457       CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3458     }
3459   }
3460 
3461   // TODO: Unify with private memory register handling. This is complicated by
3462   // the fact that at least in kernels, the input argument is not necessarily
3463   // in the same location as the input.
3464   // clang-format off
3465   static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3466                              StringLiteral> ImplicitAttrs[] = {
3467       {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3468       {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr"},
3469       {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3470       {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3471       {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3472       {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
3473       {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
3474       {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id"},
3475   };
3476   // clang-format on
3477 
3478   for (auto [InputID, Attr] : ImplicitAttrs) {
3479     // If the callee does not use the attribute value, skip copying the value.
3480     if (CLI.CB->hasFnAttr(Attr))
3481       continue;
3482 
3483     const auto [OutgoingArg, ArgRC, ArgTy] =
3484         CalleeArgInfo->getPreloadedValue(InputID);
3485     if (!OutgoingArg)
3486       continue;
3487 
3488     const auto [IncomingArg, IncomingArgRC, Ty] =
3489         CallerArgInfo.getPreloadedValue(InputID);
3490     assert(IncomingArgRC == ArgRC);
3491 
3492     // All special arguments are ints for now.
3493     EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3494     SDValue InputReg;
3495 
3496     if (IncomingArg) {
3497       InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3498     } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3499       // The implicit arg ptr is special because it doesn't have a corresponding
3500       // input for kernels, and is computed from the kernarg segment pointer.
3501       InputReg = getImplicitArgPtr(DAG, DL);
3502     } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3503       std::optional<uint32_t> Id =
3504           AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3505       if (Id.has_value()) {
3506         InputReg = DAG.getConstant(*Id, DL, ArgVT);
3507       } else {
3508         InputReg = DAG.getPOISON(ArgVT);
3509       }
3510     } else {
3511       // We may have proven the input wasn't needed, although the ABI
3512       // requires it. We just need to allocate the register appropriately.
3513       InputReg = DAG.getPOISON(ArgVT);
3514     }
3515 
3516     if (OutgoingArg->isRegister()) {
3517       RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3518       if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3519         report_fatal_error("failed to allocate implicit input argument");
3520     } else {
3521       unsigned SpecialArgOffset =
3522           CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3523       SDValue ArgStore =
3524           storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3525       MemOpChains.push_back(ArgStore);
3526     }
3527   }
3528 
3529   // Pack the workitem IDs into a single register, or pass them as-is if
3530   // already packed.
3531 
3532   auto [OutgoingArg, ArgRC, Ty] =
3533       CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3534   if (!OutgoingArg)
3535     std::tie(OutgoingArg, ArgRC, Ty) =
3536         CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3537   if (!OutgoingArg)
3538     std::tie(OutgoingArg, ArgRC, Ty) =
3539         CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3540   if (!OutgoingArg)
3541     return;
3542 
3543   const ArgDescriptor *IncomingArgX = std::get<0>(
3544       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3545   const ArgDescriptor *IncomingArgY = std::get<0>(
3546       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3547   const ArgDescriptor *IncomingArgZ = std::get<0>(
3548       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3549 
3550   SDValue InputReg;
3551   SDLoc SL;
3552 
3553   const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3554   const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3555   const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3556 
3557   // If the incoming IDs are not packed, we need to pack them.
3558   if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3559       NeedWorkItemIDX) {
3560     if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3561       InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3562     } else {
3563       InputReg = DAG.getConstant(0, DL, MVT::i32);
3564     }
3565   }
3566 
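       // Y and Z are shifted into bits [19:10] and [29:20] and OR'd into the
       // packed register, matching the layout of the kernel's packed IDs.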
3567   if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3568       NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3569     SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3570     Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3571                     DAG.getShiftAmountConstant(10, MVT::i32, SL));
3572     InputReg = InputReg.getNode()
3573                    ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3574                    : Y;
3575   }
3576 
3577   if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3578       NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3579     SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3580     Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3581                     DAG.getShiftAmountConstant(20, MVT::i32, SL));
3582     InputReg = InputReg.getNode()
3583                    ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3584                    : Z;
3585   }
3586 
3587   if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3588     if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3589       // We're in a situation where the outgoing function requires the workitem
3590       // ID, but the calling function does not have it (e.g. a graphics function
3591       // calling a C calling convention function). This is illegal, but we need
3592       // to produce something.
3593       InputReg = DAG.getPOISON(MVT::i32);
3594     } else {
3595       // The workitem IDs are already packed, so any of the present incoming
3596       // arguments will carry all the required fields.
3597       ArgDescriptor IncomingArg =
3598           ArgDescriptor::createArg(IncomingArgX   ? *IncomingArgX
3599                                    : IncomingArgY ? *IncomingArgY
3600                                                   : *IncomingArgZ,
3601                                    ~0u);
3602       InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3603     }
3604   }
3605 
3606   if (OutgoingArg->isRegister()) {
3607     if (InputReg)
3608       RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3609 
3610     CCInfo.AllocateReg(OutgoingArg->getRegister());
3611   } else {
3612     unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3613     if (InputReg) {
3614       SDValue ArgStore =
3615           storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3616       MemOpChains.push_back(ArgStore);
3617     }
3618   }
3619 }
3620 
3621 bool SITargetLowering::isEligibleForTailCallOptimization(
3622     SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3623     const SmallVectorImpl<ISD::OutputArg> &Outs,
3624     const SmallVectorImpl<SDValue> &OutVals,
3625     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
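       // Calls to llvm.amdgcn.cs.chain never return to the caller, so they
       // are always eligible to be lowered as tail calls.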
3626   if (AMDGPU::isChainCC(CalleeCC))
3627     return true;
3628 
3629   if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3630     return false;
3631 
3632   // For a divergent call target, we need to do a waterfall loop over the
3633   // possible callees which precludes us from using a simple jump.
3634   if (Callee->isDivergent())
3635     return false;
3636 
3637   MachineFunction &MF = DAG.getMachineFunction();
3638   const Function &CallerF = MF.getFunction();
3639   CallingConv::ID CallerCC = CallerF.getCallingConv();
3640   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3641   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3642 
3643   // Kernels aren't callable, and don't have a live-in return address, so it
3644   // doesn't make sense to do a tail call with entry functions.
3645   if (!CallerPreserved)
3646     return false;
3647 
3648   bool CCMatch = CallerCC == CalleeCC;
3649 
3650   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3651     if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3652       return true;
3653     return false;
3654   }
3655 
3656   // TODO: Can we handle var args?
3657   if (IsVarArg)
3658     return false;
3659 
3660   for (const Argument &Arg : CallerF.args()) {
3661     if (Arg.hasByValAttr())
3662       return false;
3663   }
3664 
3665   LLVMContext &Ctx = *DAG.getContext();
3666 
3667   // Check that the call results are passed in the same way.
3668   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3669                                   CCAssignFnForCall(CalleeCC, IsVarArg),
3670                                   CCAssignFnForCall(CallerCC, IsVarArg)))
3671     return false;
3672 
3673   // The callee has to preserve all registers the caller needs to preserve.
3674   if (!CCMatch) {
3675     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3676     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3677       return false;
3678   }
3679 
3680   // Nothing more to check if the callee is taking no arguments.
3681   if (Outs.empty())
3682     return true;
3683 
3684   SmallVector<CCValAssign, 16> ArgLocs;
3685   CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3686 
3687   // FIXME: We are not allocating special input registers, so we will be
3688   // deciding based on incorrect register assignments.
3689   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3690 
3691   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3692   // If the stack arguments for this call do not fit into our own save area then
3693   // the call cannot be made tail.
3694   // TODO: Is this really necessary?
3695   if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3696     return false;
3697 
3698   for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3699     // FIXME: What about inreg arguments that end up passed in memory?
3700     if (!CCVA.isRegLoc())
3701       continue;
3702 
3703     // If we are passing an argument in an SGPR, and the value is divergent,
3704     // this call requires a waterfall loop.
3705     if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3706       LLVM_DEBUG(
3707           dbgs() << "Cannot tail call due to divergent outgoing argument in "
3708                  << printReg(CCVA.getLocReg(), TRI) << '\n');
3709       return false;
3710     }
3711   }
3712 
3713   const MachineRegisterInfo &MRI = MF.getRegInfo();
3714   return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3715 }
3716 
3717 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3718   if (!CI->isTailCall())
3719     return false;
3720 
3721   const Function *ParentFn = CI->getParent()->getParent();
3722   if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
3723     return false;
3724   return true;
3725 }
3726 
3727 namespace {
3728 // Chain calls have special arguments that we need to handle. These are
3729 // tagging along at the end of the arguments list(s), after the SGPR and VGPR
3730 // arguments (indices 0 and 1, respectively).
3731 enum ChainCallArgIdx {
3732   Exec = 2,
3733   Flags,
3734   NumVGPRs,
3735   FallbackExec,
3736   FallbackCallee
3737 };
3738 } // anonymous namespace
3739 
3740 // The wave scratch offset register is used as the global base pointer.
3741 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3742                                     SmallVectorImpl<SDValue> &InVals) const {
3743   CallingConv::ID CallConv = CLI.CallConv;
3744   bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3745 
3746   SelectionDAG &DAG = CLI.DAG;
3747 
3748   const SDLoc &DL = CLI.DL;
3749   SDValue Chain = CLI.Chain;
3750   SDValue Callee = CLI.Callee;
3751 
3752   llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
3753   bool UsesDynamicVGPRs = false;
3754   if (IsChainCallConv) {
3755     // The last arguments should be the value that we need to put in EXEC,
3756     // followed by the flags and any other arguments with special meanings.
3757     // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
3758     // we don't treat them like the "real" arguments.
3759     auto RequestedExecIt =
3760         llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
3761           return Arg.OrigArgIndex == 2;
3762         });
3763     assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
3764 
3765     size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
3766     CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
3767                       CLI.OutVals.end());
3768     CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
3769 
3770     assert(CLI.Outs.back().OrigArgIndex < 2 &&
3771            "Haven't popped all the special args");
3772 
3773     TargetLowering::ArgListEntry RequestedExecArg =
3774         CLI.Args[ChainCallArgIdx::Exec];
3775     if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3776       return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3777 
3778     // Convert constants into TargetConstants, so they become immediate operands
3779     // instead of being selected into S_MOV.
3780     auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
3781       if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
3782         ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
3783             ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
3784       } else
3785         ChainCallSpecialArgs.push_back(Arg.Node);
3786     };
3787 
3788     PushNodeOrTargetConstant(RequestedExecArg);
3789 
3790     // Process any other special arguments depending on the value of the flags.
3791     TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
3792 
3793     const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
3794     if (FlagsValue.isZero()) {
3795       if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
3796         return lowerUnhandledCall(CLI, InVals,
3797                                   "no additional args allowed if flags == 0");
3798     } else if (FlagsValue.isOneBitSet(0)) {
3799       if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
3800         return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
3801       }
3802 
3803       if (!Subtarget->isWave32()) {
3804         return lowerUnhandledCall(
3805             CLI, InVals, "dynamic VGPR mode is only supported for wave32");
3806       }
3807 
3808       UsesDynamicVGPRs = true;
3809       std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
3810                     CLI.Args.end(), PushNodeOrTargetConstant);
3811     }
3812   }
3813 
3814   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3815   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3816   SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3817   bool &IsTailCall = CLI.IsTailCall;
3818   bool IsVarArg = CLI.IsVarArg;
3819   bool IsSibCall = false;
3820   MachineFunction &MF = DAG.getMachineFunction();
3821 
3822   if (Callee.isUndef() || isNullConstant(Callee)) {
3823     if (!CLI.IsTailCall) {
3824       for (ISD::InputArg &Arg : CLI.Ins)
3825         InVals.push_back(DAG.getPOISON(Arg.VT));
3826     }
3827 
3828     return Chain;
3829   }
3830 
3831   if (IsVarArg) {
3832     return lowerUnhandledCall(CLI, InVals,
3833                               "unsupported call to variadic function ");
3834   }
3835 
3836   if (!CLI.CB)
3837     return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
3838 
3839   if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3840     return lowerUnhandledCall(CLI, InVals,
3841                               "unsupported required tail call to function ");
3842   }
3843 
3844   if (IsTailCall) {
3845     IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
3846                                                    Outs, OutVals, Ins, DAG);
3847     if (!IsTailCall &&
3848         ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3849       report_fatal_error("failed to perform tail call elimination on a call "
3850                          "site marked musttail or on llvm.amdgcn.cs.chain");
3851     }
3852 
3853     bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3854 
3855     // A sibling call is one where we're under the usual C ABI and not planning
3856     // to change that but can still do a tail call:
3857     if (!TailCallOpt && IsTailCall)
3858       IsSibCall = true;
3859 
3860     if (IsTailCall)
3861       ++NumTailCalls;
3862   }
3863 
3864   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3865   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3866   SmallVector<SDValue, 8> MemOpChains;
3867 
3868   // Analyze operands of the call, assigning locations to each operand.
3869   SmallVector<CCValAssign, 16> ArgLocs;
3870   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3871   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3872 
3873   if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3874     // With a fixed ABI, allocate fixed registers before user arguments.
3875     passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3876   }
3877 
3878   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3879 
3880   // Get a count of how many bytes are to be pushed on the stack.
3881   unsigned NumBytes = CCInfo.getStackSize();
3882 
3883   if (IsSibCall) {
3884     // Since we're not changing the ABI to make this a tail call, the memory
3885     // operands are already available in the caller's incoming argument space.
3886     NumBytes = 0;
3887   }
3888 
3889   // FPDiff is the byte offset of the call's argument area from the callee's.
3890   // Stores to callee stack arguments will be placed in FixedStackSlots offset
3891   // by this amount for a tail call. In a sibling call it must be 0 because the
3892   // caller will deallocate the entire stack and the callee still expects its
3893   // arguments to begin at SP+0. Completely unused for non-tail calls.
3894   int32_t FPDiff = 0;
3895   MachineFrameInfo &MFI = MF.getFrameInfo();
3896   auto *TRI = Subtarget->getRegisterInfo();
3897 
3898   // Adjust the stack pointer for the new arguments...
3899   // These operations are automatically eliminated by the prolog/epilog pass
3900   if (!IsSibCall)
3901     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3902 
3903   if (!IsSibCall || IsChainCallConv) {
3904     if (!Subtarget->enableFlatScratch()) {
3905       SmallVector<SDValue, 4> CopyFromChains;
3906 
3907       // In the HSA case, this should be an identity copy.
3908       SDValue ScratchRSrcReg =
3909           DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3910       RegsToPass.emplace_back(IsChainCallConv
3911                                   ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3912                                   : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3913                               ScratchRSrcReg);
3914       CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3915       Chain = DAG.getTokenFactor(DL, CopyFromChains);
3916     }
3917   }
3918 
3919   const unsigned NumSpecialInputs = RegsToPass.size();
3920 
3921   MVT PtrVT = MVT::i32;
3922 
3923   // Walk the register/memloc assignments, inserting copies/loads.
3924   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3925     CCValAssign &VA = ArgLocs[i];
3926     SDValue Arg = OutVals[i];
3927 
3928     // Promote the value if needed.
3929     switch (VA.getLocInfo()) {
3930     case CCValAssign::Full:
3931       break;
3932     case CCValAssign::BCvt:
3933       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3934       break;
3935     case CCValAssign::ZExt:
3936       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3937       break;
3938     case CCValAssign::SExt:
3939       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3940       break;
3941     case CCValAssign::AExt:
3942       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3943       break;
3944     case CCValAssign::FPExt:
3945       Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3946       break;
3947     default:
3948       llvm_unreachable("Unknown loc info!");
3949     }
3950 
3951     if (VA.isRegLoc()) {
3952       RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3953     } else {
3954       assert(VA.isMemLoc());
3955 
3956       SDValue DstAddr;
3957       MachinePointerInfo DstInfo;
3958 
3959       unsigned LocMemOffset = VA.getLocMemOffset();
3960       int32_t Offset = LocMemOffset;
3961 
3962       SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3963       MaybeAlign Alignment;
3964 
3965       if (IsTailCall) {
3966         ISD::ArgFlagsTy Flags = Outs[i].Flags;
3967         unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
3968                                           : VA.getValVT().getStoreSize();
3969 
3970         // FIXME: We can have better than the minimum byval required alignment.
3971         Alignment =
3972             Flags.isByVal()
3973                 ? Flags.getNonZeroByValAlign()
3974                 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3975 
3976         Offset = Offset + FPDiff;
3977         int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3978 
3979         DstAddr = DAG.getFrameIndex(FI, PtrVT);
3980         DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3981 
3982         // Make sure any stack arguments overlapping with where we're storing
3983         // are loaded before this eventual operation. Otherwise they'll be
3984         // clobbered.
3985 
3986         // FIXME: Why is this really necessary? This seems to just result in a
3987         // lot of code to copy the stack and write them back to the same
3988         // locations, which are supposed to be immutable?
3989         Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3990       } else {
3991         // Stores to the argument stack area are relative to the stack pointer.
3992         SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3993                                         MVT::i32);
3994         DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3995         DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3996         Alignment =
3997             commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3998       }
3999 
4000       if (Outs[i].Flags.isByVal()) {
4001         SDValue SizeNode =
4002             DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4003         SDValue Cpy =
4004             DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4005                           Outs[i].Flags.getNonZeroByValAlign(),
4006                           /*isVol = */ false, /*AlwaysInline = */ true,
4007                           /*CI=*/nullptr, std::nullopt, DstInfo,
4008                           MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
4009 
4010         MemOpChains.push_back(Cpy);
4011       } else {
4012         SDValue Store =
4013             DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4014         MemOpChains.push_back(Store);
4015       }
4016     }
4017   }
4018 
4019   if (!MemOpChains.empty())
4020     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4021 
4022   SDValue ReadFirstLaneID =
4023       DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4024 
4025   SDValue TokenGlue;
4026   if (CLI.ConvergenceControlToken) {
4027     TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4028                             CLI.ConvergenceControlToken);
4029   }
4030 
4031   // Build a sequence of copy-to-reg nodes chained together with token chain
4032   // and flag operands which copy the outgoing args into the appropriate regs.
4033   SDValue InGlue;
4034 
4035   unsigned ArgIdx = 0;
4036   for (auto [Reg, Val] : RegsToPass) {
4037     if (ArgIdx++ >= NumSpecialInputs &&
4038         (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4039       // For chain calls, the inreg arguments are required to be
4040       // uniform. Speculatively insert a readfirstlane in case we cannot prove
4041       // they are uniform.
4042       //
4043       // For other calls, if an inreg argument is known to be uniform,
4044       // speculatively insert a readfirstlane in case it is in a VGPR.
4045       //
4046       // FIXME: A divergent value really needs a waterfall loop here; for now
4047       // that case continues to produce invalid code.
4048 
4049       SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4050       if (TokenGlue)
4051         ReadfirstlaneArgs.push_back(TokenGlue);
4052       Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
4053                         ReadfirstlaneArgs);
4054     }
4055 
4056     Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4057     InGlue = Chain.getValue(1);
4058   }
4059 
4060   // We don't usually want to end the call-sequence here because we would tidy
4061   // the frame up *after* the call, however in the ABI-changing tail-call case
4062   // we've carefully laid out the parameters so that when sp is reset they'll be
4063   // in the correct location.
4064   if (IsTailCall && !IsSibCall) {
4065     Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4066     InGlue = Chain.getValue(1);
4067   }
4068 
4069   std::vector<SDValue> Ops({Chain});
4070 
4071   // Add a redundant copy of the callee global which will not be legalized, as
4072   // we need direct access to the callee later.
4073   if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
4074     const GlobalValue *GV = GSD->getGlobal();
4075     Ops.push_back(Callee);
4076     Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4077   } else {
4078     if (IsTailCall) {
4079       // isEligibleForTailCallOptimization considered whether the call target is
4080       // divergent, but we may still end up with a uniform value in a VGPR.
4081       // Insert a readfirstlane just in case.
4082       SDValue ReadFirstLaneID =
4083           DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4084 
4085       SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4086       if (TokenGlue)
4087         ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4088       Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4089                            ReadfirstlaneArgs);
4090     }
4091 
4092     Ops.push_back(Callee);
4093     Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4094   }
4095 
4096   if (IsTailCall) {
4097     // Each tail call may have to adjust the stack by a different amount, so
4098     // this information must travel along with the operation for eventual
4099     // consumption by emitEpilogue.
4100     Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4101   }
4102 
4103   if (IsChainCallConv)
4104     llvm::append_range(Ops, ChainCallSpecialArgs);
4105 
4106   // Add argument registers to the end of the list so that they are known live
4107   // into the call.
4108   for (auto &[Reg, Val] : RegsToPass)
4109     Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4110 
4111   // Add a register mask operand representing the call-preserved registers.
4112   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4113   assert(Mask && "Missing call preserved mask for calling convention");
4114   Ops.push_back(DAG.getRegisterMask(Mask));
4115 
4116   if (SDValue Token = CLI.ConvergenceControlToken) {
4117     SmallVector<SDValue, 2> GlueOps;
4118     GlueOps.push_back(Token);
4119     if (InGlue)
4120       GlueOps.push_back(InGlue);
4121 
4122     InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4123                                         MVT::Glue, GlueOps),
4124                      0);
4125   }
4126 
4127   if (InGlue)
4128     Ops.push_back(InGlue);
4129 
4130   // If we're doing a tail call, use a TC_RETURN here rather than an
4131   // actual call instruction.
4132   if (IsTailCall) {
4133     MFI.setHasTailCall();
4134     unsigned OPC = AMDGPUISD::TC_RETURN;
4135     switch (CallConv) {
4136     case CallingConv::AMDGPU_Gfx:
4137       OPC = AMDGPUISD::TC_RETURN_GFX;
4138       break;
4139     case CallingConv::AMDGPU_CS_Chain:
4140     case CallingConv::AMDGPU_CS_ChainPreserve:
4141       OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4142                              : AMDGPUISD::TC_RETURN_CHAIN;
4143       break;
4144     }
4145 
4146     return DAG.getNode(OPC, DL, MVT::Other, Ops);
4147   }
4148 
4149   // Returns a chain and a flag for retval copy to use.
4150   SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4151   Chain = Call.getValue(0);
4152   InGlue = Call.getValue(1);
4153 
4154   uint64_t CalleePopBytes = NumBytes;
4155   Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4156   if (!Ins.empty())
4157     InGlue = Chain.getValue(1);
4158 
4159   // Handle result values, copying them out of physregs into vregs that we
4160   // return.
4161   return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4162                          InVals, /*IsThisReturn=*/false, SDValue());
4163 }
4164 
4165 // This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4166 // except for:
4167 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4168 // 2. Size scaling, where size = wave-reduction(alloca-size) * wave-size.
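//
// For example (a sketch, assuming wave64): a per-lane alloca of 16 bytes is
// scaled to 16 << 6 = 1024 bytes of actual scratch, since each lane of the
// wave gets its own swizzled copy of the allocation.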
4169 SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
4170                                                   SelectionDAG &DAG) const {
4171   const MachineFunction &MF = DAG.getMachineFunction();
4172   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4173 
4174   SDLoc dl(Op);
4175   EVT VT = Op.getValueType();
4176   SDValue Chain = Op.getOperand(0);
4177   Register SPReg = Info->getStackPtrOffsetReg();
4178 
4179   // Chain the dynamic stack allocation so that it doesn't modify the stack
4180   // pointer when other instructions are using the stack.
4181   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4182 
4183   SDValue Size = Op.getOperand(1);
4184   SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4185   Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4186 
4187   const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4188   assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4189          "Stack grows upwards for AMDGPU");
4190 
4191   Chain = BaseAddr.getValue(1);
4192   Align StackAlign = TFL->getStackAlign();
4193   if (Alignment > StackAlign) {
4194     uint64_t ScaledAlignment = (uint64_t)Alignment.value()
4195                                << Subtarget->getWavefrontSizeLog2();
4196     uint64_t StackAlignMask = ScaledAlignment - 1;
4197     SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4198                                   DAG.getConstant(StackAlignMask, dl, VT));
4199     BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4200                            DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4201   }
4202 
4203   assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4204   SDValue NewSP;
4205   if (isa<ConstantSDNode>(Size)) {
4206     // For a constant-sized alloca, scale the alloca size by the wave size.
4207     SDValue ScaledSize = DAG.getNode(
4208         ISD::SHL, dl, VT, Size,
4209         DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4210     NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Uniform value.
4211   } else {
4212     // For a dynamically sized alloca, perform a wave-wide reduction to get
4213     // the max alloca size (divergent), then scale it by the wave size.
4214     SDValue WaveReduction =
4215         DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4216     Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4217                        Size, DAG.getConstant(0, dl, MVT::i32));
4218     SDValue ScaledSize = DAG.getNode(
4219         ISD::SHL, dl, VT, Size,
4220         DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4221     NewSP =
4222         DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4223     SDValue ReadFirstLaneID =
4224         DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4225     NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4226                         NewSP);
4227   }
4228 
4229   Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4230   SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4231 
4232   return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4233 }
4234 
4235 SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4236   if (Op.getValueType() != MVT::i32)
4237     return Op; // Defer to cannot select error.
4238 
4239   Register SP = getStackPointerRegisterToSaveRestore();
4240   SDLoc SL(Op);
4241 
4242   SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4243 
4244   // Convert from wave uniform to swizzled vector address. This should protect
4245   // from any edge cases where the stacksave result isn't directly used with
4246   // stackrestore.
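  // (AMDGPUISD::WAVE_ADDRESS is expected to lower to a right shift by
  // log2(wavefront size), mapping the wave-level byte offset held in SP to a
  // per-lane swizzled private address; this is a sketch of the intent rather
  // than a statement of the exact lowering.)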
4247   SDValue VectorAddress =
4248       DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4249   return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4250 }
4251 
4252 SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4253                                             SelectionDAG &DAG) const {
4254   SDLoc SL(Op);
4255   assert(Op.getValueType() == MVT::i32);
4256 
4257   uint32_t BothRoundHwReg =
4258       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4259   SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4260 
4261   SDValue IntrinID =
4262       DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4263   SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4264                                Op.getOperand(0), IntrinID, GetRoundBothImm);
4265 
4266   // There are two rounding modes, one for f32 and one for f64/f16. We only
4267   // report in the standard value range if both are the same.
4268   //
4269   // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4270   // ties away from zero is not supported, and the other values are rotated by
4271   // 1.
4272   //
4273   // If the two rounding modes are not the same, report a target defined value.
4274 
4275   // Mode register rounding mode fields:
4276   //
4277   // [1:0] Single-precision round mode.
4278   // [3:2] Double/Half-precision round mode.
4279   //
4280   // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4281   //
4282   //             Hardware   Spec
4283   // Toward-0        3        0
4284   // Nearest Even    0        1
4285   // +Inf            1        2
4286   // -Inf            2        3
4287   //  NearestAway0  N/A       4
4288   //
4289   // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4290   // table we can index by the raw hardware mode.
4291   //
4292   // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
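  //
  // Worked example (derived from the mapping above): MODE.fp_round == 0 means
  // both fields are round-to-nearest-even, the shift amount is 0, and the low
  // nibble of the table should be the standard value 1, which is < 4, so no
  // extended offset is applied. When the two fields differ, the nibble is >= 4
  // and the +4 extended-value offset below kicks in.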
4293 
4294   SDValue BitTable =
4295       DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4296 
4297   SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4298   SDValue RoundModeTimesNumBits =
4299       DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4300 
4301   // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4302   // knew only one mode was demanded.
4303   SDValue TableValue =
4304       DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4305   SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4306 
4307   SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4308   SDValue TableEntry =
4309       DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4310 
4311   // There's a gap in the 4-bit encoded table and actual enum values, so offset
4312   // if it's an extended value.
4313   SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4314   SDValue IsStandardValue =
4315       DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4316   SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4317   SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4318                                TableEntry, EnumOffset);
4319 
4320   return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4321 }
4322 
4323 SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4324                                             SelectionDAG &DAG) const {
4325   SDLoc SL(Op);
4326 
4327   SDValue NewMode = Op.getOperand(1);
4328   assert(NewMode.getValueType() == MVT::i32);
4329 
4330   // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4331   // hardware MODE.fp_round values.
4332   if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4333     uint32_t ClampedVal = std::min(
4334         static_cast<uint32_t>(ConstMode->getZExtValue()),
4335         static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
4336     NewMode = DAG.getConstant(
4337         AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4338   } else {
4339     // If we know the input can only be one of the supported standard modes in
4340     // the range 0-3, we can use a simplified mapping to hardware values.
4341     KnownBits KB = DAG.computeKnownBits(NewMode);
4342     const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4343     // The supported standard values are 0-3. The extended values start at 8. We
4344     // need to offset by 4 if the value is in the extended range.
4345 
4346     if (UseReducedTable) {
4347       // Truncate to an i32; only the low 16 bits (4 entries x 4 bits) are needed.
4348       SDValue BitTable = DAG.getConstant(
4349           AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4350 
4351       SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4352       SDValue RoundModeTimesNumBits =
4353           DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4354 
4355       NewMode =
4356           DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4357 
4358       // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4359       // the table extracted bits into inline immediates.
4360     } else {
4361       // table_index = umin(value, value - 4)
4362       // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
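      // E.g. (unsigned arithmetic): value 2 gives umin(2, 0xfffffffe) = 2,
      // while the first extended value 8 gives umin(8, 4) = 4, so standard
      // values select entries 0-3 and extended values select entries 4+.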
4363       SDValue BitTable =
4364           DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4365 
4366       SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4367       SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4368       SDValue IndexVal =
4369           DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4370 
4371       SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4372       SDValue RoundModeTimesNumBits =
4373           DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4374 
4375       SDValue TableValue =
4376           DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4377       SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4378 
4379       // No need to mask out the high bits since the setreg will ignore them
4380       // anyway.
4381       NewMode = TruncTable;
4382     }
4383 
4384     // Insert a readfirstlane in case the value is a VGPR. We could do this
4385     // earlier and keep more operations scalar, but that interferes with
4386     // combining the source.
4387     SDValue ReadFirstLaneID =
4388         DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4389     NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4390                           ReadFirstLaneID, NewMode);
4391   }
4392 
4393   // N.B. The setreg will later be folded into s_round_mode on supported
4394   // targets.
4395   SDValue IntrinID =
4396       DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4397   uint32_t BothRoundHwReg =
4398       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4399   SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4400 
4401   SDValue SetReg =
4402       DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4403                   IntrinID, RoundBothImm, NewMode);
4404 
4405   return SetReg;
4406 }
4407 
4408 SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4409   if (Op->isDivergent())
4410     return SDValue();
4411 
4412   switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4413   case AMDGPUAS::FLAT_ADDRESS:
4414   case AMDGPUAS::GLOBAL_ADDRESS:
4415   case AMDGPUAS::CONSTANT_ADDRESS:
4416   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4417     break;
4418   default:
4419     return SDValue();
4420   }
4421 
4422   return Op;
4423 }
4424 
4425 // Work around DAG legality rules that are based only on the result type.
4426 SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4427   bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4428   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4429   EVT SrcVT = Src.getValueType();
4430 
4431   if (SrcVT.getScalarType() != MVT::bf16)
4432     return Op;
4433 
4434   SDLoc SL(Op);
4435   SDValue BitCast =
4436       DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4437 
4438   EVT DstVT = Op.getValueType();
4439   if (IsStrict)
4440     llvm_unreachable("Need STRICT_BF16_TO_FP");
4441 
4442   return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4443 }
4444 
4445 SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4446   SDLoc SL(Op);
4447   if (Op.getValueType() != MVT::i64)
4448     return Op;
4449 
4450   uint32_t ModeHwReg =
4451       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
4452   SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4453   uint32_t TrapHwReg =
4454       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
4455   SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4456 
4457   SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4458   SDValue IntrinID =
4459       DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4460   SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4461                                    Op.getOperand(0), IntrinID, ModeHwRegImm);
4462   SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4463                                    Op.getOperand(0), IntrinID, TrapHwRegImm);
4464   SDValue TokenReg =
4465       DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4466                   GetTrapReg.getValue(1));
4467 
4468   SDValue CvtPtr =
4469       DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4470   SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4471 
4472   return DAG.getMergeValues({Result, TokenReg}, SL);
4473 }
4474 
4475 SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4476   SDLoc SL(Op);
4477   if (Op.getOperand(1).getValueType() != MVT::i64)
4478     return Op;
4479 
4480   SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4481   SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4482                                    DAG.getConstant(0, SL, MVT::i32));
4483   SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4484                                    DAG.getConstant(1, SL, MVT::i32));
4485 
4486   SDValue ReadFirstLaneID =
4487       DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4488   NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4489                            ReadFirstLaneID, NewModeReg);
4490   NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4491                            ReadFirstLaneID, NewTrapReg);
4492 
4493   unsigned ModeHwReg =
4494       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
4495   SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4496   unsigned TrapHwReg =
4497       AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
4498   SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4499 
4500   SDValue IntrinID =
4501       DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4502   SDValue SetModeReg =
4503       DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4504                   IntrinID, ModeHwRegImm, NewModeReg);
4505   SDValue SetTrapReg =
4506       DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4507                   IntrinID, TrapHwRegImm, NewTrapReg);
4508   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4509 }
4510 
4511 Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4512                                              const MachineFunction &MF) const {
4513   const Function &Fn = MF.getFunction();
4514 
4515   Register Reg = StringSwitch<Register>(RegName)
4516                      .Case("m0", AMDGPU::M0)
4517                      .Case("exec", AMDGPU::EXEC)
4518                      .Case("exec_lo", AMDGPU::EXEC_LO)
4519                      .Case("exec_hi", AMDGPU::EXEC_HI)
4520                      .Case("flat_scratch", AMDGPU::FLAT_SCR)
4521                      .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4522                      .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4523                      .Default(Register());
4524   if (!Reg)
4525     return Reg;
4526 
4527   if (!Subtarget->hasFlatScrRegister() &&
4528       Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4529     Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4530                                     "\" for subtarget."));
4531   }
4532 
4533   switch (Reg) {
4534   case AMDGPU::M0:
4535   case AMDGPU::EXEC_LO:
4536   case AMDGPU::EXEC_HI:
4537   case AMDGPU::FLAT_SCR_LO:
4538   case AMDGPU::FLAT_SCR_HI:
4539     if (VT.getSizeInBits() == 32)
4540       return Reg;
4541     break;
4542   case AMDGPU::EXEC:
4543   case AMDGPU::FLAT_SCR:
4544     if (VT.getSizeInBits() == 64)
4545       return Reg;
4546     break;
4547   default:
4548     llvm_unreachable("missing register type checking");
4549   }
4550 
4551   report_fatal_error(
4552       Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4553 }
4554 
4555 // If kill is not the last instruction, split the block so kill is always a
4556 // proper terminator.
4557 MachineBasicBlock *
4558 SITargetLowering::splitKillBlock(MachineInstr &MI,
4559                                  MachineBasicBlock *BB) const {
4560   MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4561   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4562   MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4563   return SplitBB;
4564 }
4565 
4566 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4567 // \p MI will be the only instruction in the loop body block. Otherwise, it will
4568 // be the first instruction in the remainder block.
4569 //
4570 /// \returns { LoopBody, Remainder }
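//
// Resulting control flow (sketch):
//   MBB -> LoopBB -> RemainderBB
//           ^   |
//           +---+   (LoopBB also branches back to itself)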
4571 static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4572 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
4573   MachineFunction *MF = MBB.getParent();
4574   MachineBasicBlock::iterator I(&MI);
4575 
4576   // To insert the loop we need to split the block. Move everything after this
4577   // point to a new block, and insert a new empty block between the two.
4578   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4579   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4580   MachineFunction::iterator MBBI(MBB);
4581   ++MBBI;
4582 
4583   MF->insert(MBBI, LoopBB);
4584   MF->insert(MBBI, RemainderBB);
4585 
4586   LoopBB->addSuccessor(LoopBB);
4587   LoopBB->addSuccessor(RemainderBB);
4588 
4589   // Move the rest of the block into a new block.
4590   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4591 
4592   if (InstInLoop) {
4593     auto Next = std::next(I);
4594 
4595     // Move instruction to loop body.
4596     LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4597 
4598     // Move the rest of the block.
4599     RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4600   } else {
4601     RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4602   }
4603 
4604   MBB.addSuccessor(LoopBB);
4605 
4606   return std::pair(LoopBB, RemainderBB);
4607 }
4608 
4609 /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4610 void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
4611   MachineBasicBlock *MBB = MI.getParent();
4612   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4613   auto I = MI.getIterator();
4614   auto E = std::next(I);
4615 
4616   // clang-format off
4617   BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4618       .addImm(0);
4619   // clang-format on
4620 
4621   MIBundleBuilder Bundler(*MBB, I, E);
4622   finalizeBundle(*MBB, Bundler.begin());
4623 }
4624 
4625 MachineBasicBlock *
4626 SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
4627                                          MachineBasicBlock *BB) const {
4628   const DebugLoc &DL = MI.getDebugLoc();
4629 
4630   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4631 
4632   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4633 
4634   // Apparently kill flags are only valid if the def is in the same block?
4635   if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4636     Src->setIsKill(false);
4637 
4638   auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4639 
4640   MachineBasicBlock::iterator I = LoopBB->end();
4641 
4642   const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4643       AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
4644 
4645   // Clear TRAP_STS.MEM_VIOL
4646   BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4647       .addImm(0)
4648       .addImm(EncodedReg);
4649 
4650   bundleInstWithWaitcnt(MI);
4651 
4652   Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4653 
4654   // Load and check TRAP_STS.MEM_VIOL
4655   BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4656       .addImm(EncodedReg);
4657 
4658   // FIXME: Do we need to use an isel pseudo that may clobber scc?
4659   BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4660       .addReg(Reg, RegState::Kill)
4661       .addImm(0);
4662   // clang-format off
4663   BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4664       .addMBB(LoopBB);
4665   // clang-format on
4666 
4667   return RemainderBB;
4668 }
4669 
4670 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4671 // wavefront. If the value is uniform and just happens to be in a VGPR, this
4672 // will only do one iteration. In the worst case, this will loop 64 times.
4673 //
4674 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
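//
// Rough shape of the emitted loop (a sketch, not the exact instruction
// sequence):
//   loop:
//     s_idx = V_READFIRSTLANE_B32 v_idx
//     cond  = V_CMP_EQ_U32 s_idx, v_idx
//     saved = EXEC; EXEC &= cond         ; S_AND_SAVEEXEC
//     ...use s_idx (via M0 or an SGPR)...
//     EXEC  = EXEC ^ saved               ; clear the lanes just handled
//     S_CBRANCH_EXECNZ loop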
4675 static MachineBasicBlock::iterator
4676 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
4677                        MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4678                        const DebugLoc &DL, const MachineOperand &Idx,
4679                        unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4680                        unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4681                        Register &SGPRIdxReg) {
4682 
4683   MachineFunction *MF = OrigBB.getParent();
4684   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4685   const SIRegisterInfo *TRI = ST.getRegisterInfo();
4686   MachineBasicBlock::iterator I = LoopBB.begin();
4687 
4688   const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4689   Register PhiExec = MRI.createVirtualRegister(BoolRC);
4690   Register NewExec = MRI.createVirtualRegister(BoolRC);
4691   Register CurrentIdxReg =
4692       MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4693   Register CondReg = MRI.createVirtualRegister(BoolRC);
4694 
4695   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4696       .addReg(InitReg)
4697       .addMBB(&OrigBB)
4698       .addReg(ResultReg)
4699       .addMBB(&LoopBB);
4700 
4701   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4702       .addReg(InitSaveExecReg)
4703       .addMBB(&OrigBB)
4704       .addReg(NewExec)
4705       .addMBB(&LoopBB);
4706 
4707   // Read the next variant; this is also the loop's branch target.
4708   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4709       .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4710 
4711   // Compare the just-read value against each lane's Idx.
4712   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4713       .addReg(CurrentIdxReg)
4714       .addReg(Idx.getReg(), 0, Idx.getSubReg());
4715 
4716   // Update EXEC, saving the original EXEC value to NewExec.
4717   BuildMI(LoopBB, I, DL,
4718           TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4719                                  : AMDGPU::S_AND_SAVEEXEC_B64),
4720           NewExec)
4721       .addReg(CondReg, RegState::Kill);
4722 
4723   MRI.setSimpleHint(NewExec, CondReg);
4724 
4725   if (UseGPRIdxMode) {
4726     if (Offset == 0) {
4727       SGPRIdxReg = CurrentIdxReg;
4728     } else {
4729       SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4730       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4731           .addReg(CurrentIdxReg, RegState::Kill)
4732           .addImm(Offset);
4733     }
4734   } else {
4735     // Move the index from CurrentIdxReg into M0.
4736     if (Offset == 0) {
4737       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
4738           .addReg(CurrentIdxReg, RegState::Kill);
4739     } else {
4740       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4741           .addReg(CurrentIdxReg, RegState::Kill)
4742           .addImm(Offset);
4743     }
4744   }
4745 
4746   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4747   unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4748   MachineInstr *InsertPt =
4749       BuildMI(LoopBB, I, DL,
4750               TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4751                                      : AMDGPU::S_XOR_B64_term),
4752               Exec)
4753           .addReg(Exec)
4754           .addReg(NewExec);
4755 
4756   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4757   // s_cbranch_scc0?
4758 
4759   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4760   // clang-format off
4761   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4762       .addMBB(&LoopBB);
4763   // clang-format on
4764 
4765   return InsertPt->getIterator();
4766 }
4767 
4768 // This has slightly sub-optimal regalloc when the source vector is killed by
4769 // the read. The register allocator does not understand that the kill is
4770 // per-workitem, so the source is kept alive for the whole loop and we end up
4771 // not reusing a subregister from it, using one more VGPR than necessary. That
4772 // VGPR was saved back when this was expanded after register allocation.
4773 static MachineBasicBlock::iterator
4774 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
4775                unsigned InitResultReg, unsigned PhiReg, int Offset,
4776                bool UseGPRIdxMode, Register &SGPRIdxReg) {
4777   MachineFunction *MF = MBB.getParent();
4778   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4779   const SIRegisterInfo *TRI = ST.getRegisterInfo();
4780   MachineRegisterInfo &MRI = MF->getRegInfo();
4781   const DebugLoc &DL = MI.getDebugLoc();
4782   MachineBasicBlock::iterator I(&MI);
4783 
4784   const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4785   Register DstReg = MI.getOperand(0).getReg();
4786   Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4787   Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4788   unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4789   unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4790 
4791   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4792 
4793   // Save the EXEC mask
4794   // clang-format off
4795   BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4796       .addReg(Exec);
4797   // clang-format on
4798 
4799   auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
4800 
4801   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4802 
4803   auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4804                                       InitResultReg, DstReg, PhiReg, TmpExec,
4805                                       Offset, UseGPRIdxMode, SGPRIdxReg);
4806 
4807   MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
4808   MachineFunction::iterator MBBI(LoopBB);
4809   ++MBBI;
4810   MF->insert(MBBI, LandingPad);
4811   LoopBB->removeSuccessor(RemainderBB);
4812   LandingPad->addSuccessor(RemainderBB);
4813   LoopBB->addSuccessor(LandingPad);
4814   MachineBasicBlock::iterator First = LandingPad->begin();
4815   // clang-format off
4816   BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4817       .addReg(SaveExec);
4818   // clang-format on
4819 
4820   return InsPt;
4821 }
4822 
4823 // Returns subreg index, offset
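// E.g. for a 128-bit (4 x 32-bit) register class, Offset 2 yields (sub2, 0),
// while an out-of-bounds Offset such as 5 comes back unchanged as (sub0, 5),
// so the caller still emits a well-defined (if meaningless) access.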
4824 static std::pair<unsigned, int>
4825 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
4826                             const TargetRegisterClass *SuperRC, unsigned VecReg,
4827                             int Offset) {
4828   int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4829 
4830   // Skip out of bounds offsets, or else we would end up using an undefined
4831   // register.
4832   if (Offset >= NumElts || Offset < 0)
4833     return std::pair(AMDGPU::sub0, Offset);
4834 
4835   return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4836 }
4837 
4838 static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
4839                                  MachineRegisterInfo &MRI, MachineInstr &MI,
4840                                  int Offset) {
4841   MachineBasicBlock *MBB = MI.getParent();
4842   const DebugLoc &DL = MI.getDebugLoc();
4843   MachineBasicBlock::iterator I(&MI);
4844 
4845   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4846 
4847   assert(Idx->getReg() != AMDGPU::NoRegister);
4848 
4849   if (Offset == 0) {
4850     // clang-format off
4851     BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
4852         .add(*Idx);
4853     // clang-format on
4854   } else {
4855     BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4856         .add(*Idx)
4857         .addImm(Offset);
4858   }
4859 }
4860 
4861 static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
4862                                    MachineRegisterInfo &MRI, MachineInstr &MI,
4863                                    int Offset) {
4864   MachineBasicBlock *MBB = MI.getParent();
4865   const DebugLoc &DL = MI.getDebugLoc();
4866   MachineBasicBlock::iterator I(&MI);
4867 
4868   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4869 
4870   if (Offset == 0)
4871     return Idx->getReg();
4872 
4873   Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4874   BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4875       .add(*Idx)
4876       .addImm(Offset);
4877   return Tmp;
4878 }
4879 
4880 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
4881                                           MachineBasicBlock &MBB,
4882                                           const GCNSubtarget &ST) {
4883   const SIInstrInfo *TII = ST.getInstrInfo();
4884   const SIRegisterInfo &TRI = TII->getRegisterInfo();
4885   MachineFunction *MF = MBB.getParent();
4886   MachineRegisterInfo &MRI = MF->getRegInfo();
4887 
4888   Register Dst = MI.getOperand(0).getReg();
4889   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4890   Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4891   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4892 
4893   const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4894   const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4895 
4896   unsigned SubReg;
4897   std::tie(SubReg, Offset) =
4898       computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4899 
4900   const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4901 
4902   // Check for an SGPR index.
4903   if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4904     MachineBasicBlock::iterator I(&MI);
4905     const DebugLoc &DL = MI.getDebugLoc();
4906 
4907     if (UseGPRIdxMode) {
4908       // TODO: Look at the uses to avoid the copy. This may require rescheduling
4909       // to avoid interfering with other uses, so probably requires a new
4910       // optimization pass.
4911       Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
4912 
4913       const MCInstrDesc &GPRIDXDesc =
4914           TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4915       BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4916           .addReg(SrcReg)
4917           .addReg(Idx)
4918           .addImm(SubReg);
4919     } else {
4920       setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
4921 
4922       BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4923           .addReg(SrcReg, 0, SubReg)
4924           .addReg(SrcReg, RegState::Implicit);
4925     }
4926 
4927     MI.eraseFromParent();
4928 
4929     return &MBB;
4930   }
4931 
4932   // Control flow needs to be inserted if indexing with a VGPR.
4933   const DebugLoc &DL = MI.getDebugLoc();
4934   MachineBasicBlock::iterator I(&MI);
4935 
4936   Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4937   Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4938 
4939   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4940 
4941   Register SGPRIdxReg;
4942   auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4943                               UseGPRIdxMode, SGPRIdxReg);
4944 
4945   MachineBasicBlock *LoopBB = InsPt->getParent();
4946 
4947   if (UseGPRIdxMode) {
4948     const MCInstrDesc &GPRIDXDesc =
4949         TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4950 
4951     BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4952         .addReg(SrcReg)
4953         .addReg(SGPRIdxReg)
4954         .addImm(SubReg);
4955   } else {
4956     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4957         .addReg(SrcReg, 0, SubReg)
4958         .addReg(SrcReg, RegState::Implicit);
4959   }
4960 
4961   MI.eraseFromParent();
4962 
4963   return LoopBB;
4964 }
4965 
4966 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
4967                                           MachineBasicBlock &MBB,
4968                                           const GCNSubtarget &ST) {
4969   const SIInstrInfo *TII = ST.getInstrInfo();
4970   const SIRegisterInfo &TRI = TII->getRegisterInfo();
4971   MachineFunction *MF = MBB.getParent();
4972   MachineRegisterInfo &MRI = MF->getRegInfo();
4973 
4974   Register Dst = MI.getOperand(0).getReg();
4975   const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4976   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4977   const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4978   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4979   const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4980   const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4981 
4982   // This can be an immediate, but will be folded later.
4983   assert(Val->getReg());
4984 
4985   unsigned SubReg;
4986   std::tie(SubReg, Offset) =
4987       computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
4988   const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4989 
4990   if (Idx->getReg() == AMDGPU::NoRegister) {
4991     MachineBasicBlock::iterator I(&MI);
4992     const DebugLoc &DL = MI.getDebugLoc();
4993 
4994     assert(Offset == 0);
4995 
4996     BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4997         .add(*SrcVec)
4998         .add(*Val)
4999         .addImm(SubReg);
5000 
5001     MI.eraseFromParent();
5002     return &MBB;
5003   }
5004 
5005   // Check for an SGPR index.
5006   if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5007     MachineBasicBlock::iterator I(&MI);
5008     const DebugLoc &DL = MI.getDebugLoc();
5009 
5010     if (UseGPRIdxMode) {
5011       Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5012 
5013       const MCInstrDesc &GPRIDXDesc =
5014           TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5015       BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5016           .addReg(SrcVec->getReg())
5017           .add(*Val)
5018           .addReg(Idx)
5019           .addImm(SubReg);
5020     } else {
5021       setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5022 
5023       const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5024           TRI.getRegSizeInBits(*VecRC), 32, false);
5025       BuildMI(MBB, I, DL, MovRelDesc, Dst)
5026           .addReg(SrcVec->getReg())
5027           .add(*Val)
5028           .addImm(SubReg);
5029     }
5030     MI.eraseFromParent();
5031     return &MBB;
5032   }
5033 
5034   // Control flow needs to be inserted if indexing with a VGPR.
5035   if (Val->isReg())
5036     MRI.clearKillFlags(Val->getReg());
5037 
5038   const DebugLoc &DL = MI.getDebugLoc();
5039 
5040   Register PhiReg = MRI.createVirtualRegister(VecRC);
5041 
5042   Register SGPRIdxReg;
5043   auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5044                               UseGPRIdxMode, SGPRIdxReg);
5045   MachineBasicBlock *LoopBB = InsPt->getParent();
5046 
5047   if (UseGPRIdxMode) {
5048     const MCInstrDesc &GPRIDXDesc =
5049         TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5050 
5051     BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5052         .addReg(PhiReg)
5053         .add(*Val)
5054         .addReg(SGPRIdxReg)
5055         .addImm(SubReg);
5056   } else {
5057     const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5058         TRI.getRegSizeInBits(*VecRC), 32, false);
5059     BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5060         .addReg(PhiReg)
5061         .add(*Val)
5062         .addImm(SubReg);
5063   }
5064 
5065   MI.eraseFromParent();
5066   return LoopBB;
5067 }
5068 
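// Identity element for each supported 32-bit wave reduction: combining any
// value x with the identity leaves x unchanged, e.g. umin(x, UINT32_MAX) == x,
// smin(x, INT32_MAX) == x, add(x, 0) == x, and(x, 0xffffffff) == x.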
5069 static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
5070   switch (Opc) {
5071   case AMDGPU::S_MIN_U32:
5072     return std::numeric_limits<uint32_t>::max();
5073   case AMDGPU::S_MIN_I32:
5074     return std::numeric_limits<int32_t>::max();
5075   case AMDGPU::S_MAX_U32:
5076     return std::numeric_limits<uint32_t>::min();
5077   case AMDGPU::S_MAX_I32:
5078     return std::numeric_limits<int32_t>::min();
5079   case AMDGPU::S_ADD_I32:
5080   case AMDGPU::S_SUB_I32:
5081   case AMDGPU::S_OR_B32:
5082   case AMDGPU::S_XOR_B32:
5083     return std::numeric_limits<uint32_t>::min();
5084   case AMDGPU::S_AND_B32:
5085     return std::numeric_limits<uint32_t>::max();
5086   default:
5087     llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
5088   }
5089 }
5090 
5091 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5092                                           MachineBasicBlock &BB,
5093                                           const GCNSubtarget &ST,
5094                                           unsigned Opc) {
5095   MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
5096   const SIRegisterInfo *TRI = ST.getRegisterInfo();
5097   const DebugLoc &DL = MI.getDebugLoc();
5098   const SIInstrInfo *TII = ST.getInstrInfo();
5099 
5100   // Reduction operations depend on whether the input operand is SGPR or VGPR.
5101   Register SrcReg = MI.getOperand(1).getReg();
5102   bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5103   Register DstReg = MI.getOperand(0).getReg();
5104   MachineBasicBlock *RetBB = nullptr;
5105   if (isSGPR) {
5106     switch (Opc) {
5107     case AMDGPU::S_MIN_U32:
5108     case AMDGPU::S_MIN_I32:
5109     case AMDGPU::S_MAX_U32:
5110     case AMDGPU::S_MAX_I32:
5111     case AMDGPU::S_AND_B32:
5112     case AMDGPU::S_OR_B32: {
5113       // Idempotent operations: for a uniform input the reduction is the input.
5114       BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5115       RetBB = &BB;
5116       break;
5117     }
5118     case AMDGPU::S_XOR_B32:
5119     case AMDGPU::S_ADD_I32:
5120     case AMDGPU::S_SUB_I32: {
5121       const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5122       const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5123       Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5124       Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
5125 
5126       bool IsWave32 = ST.isWave32();
5127       unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5128       MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5129       unsigned CountReg =
5130           IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5131 
5132       auto Exec =
5133           BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5134 
5135       auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5136                                 .addReg(Exec->getOperand(0).getReg());
5137 
5138       switch (Opc) {
5139       case AMDGPU::S_XOR_B32: {
5140         // Performing an XOR operation on a uniform value
5141         // depends on the parity of the number of active lanes.
5142         // For even parity, the result will be 0, for odd
5143         // parity the result will be the same as the input value.
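        // E.g. a uniform value v over 5 active lanes gives
        // v ^ v ^ v ^ v ^ v = v, while 4 active lanes give 0; hence
        // result = v * (popcount(EXEC) & 1).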
5144         Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
5145 
5146         auto ParityReg =
5147             BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5148                 .addReg(NewAccumulator->getOperand(0).getReg())
5149                 .addImm(1);
5150         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5151             .addReg(SrcReg)
5152             .addReg(ParityReg->getOperand(0).getReg());
5153         break;
5154       }
5155       case AMDGPU::S_SUB_I32: {
5156         Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5157 
5158         // Take the negation of the source operand.
5159         auto InvertedValReg =
5160             BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
5161                 .addImm(-1)
5162                 .addReg(SrcReg);
5163         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5164             .addReg(InvertedValReg->getOperand(0).getReg())
5165             .addReg(NewAccumulator->getOperand(0).getReg());
5166         break;
5167       }
5168       case AMDGPU::S_ADD_I32: {
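             // A uniform wave-wide addition accumulates Src once per active
             // lane, i.e. Dst = Src * popcount(exec).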
5169         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5170             .addReg(SrcReg)
5171             .addReg(NewAccumulator->getOperand(0).getReg());
5172         break;
5173       }
5174       }
5175       RetBB = &BB;
5176     }
5177     }
5178   } else {
5179     // TODO: Implement the DPP strategy and switch based on the immediate
5180     // strategy operand. For now, for all the cases (default, Iterative and
5181     // DPP) we use the iterative approach by default.
5182 
5183     // To reduce the VGPR with the iterative approach, we need to iterate
5184     // over all the active lanes. Lowering consists of a ComputeLoop which
5185     // iterates over only the active lanes. We use a copy of the EXEC
5186     // register as the induction variable, and each iteration clears the
5187     // bit of the lane just processed so the next active lane is found.
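         // An illustrative sketch of the emitted loop (using the virtual
         // registers created below):
         //   Accumulator = InitialValue; ActiveBits = copy(EXEC);
         //   do {
         //     FF1 = ff1(ActiveBits);           // lowest set bit = next lane
         //     LaneValue = readlane(Src, FF1);
         //     Accumulator = Opc(Accumulator, LaneValue);
         //     ActiveBits = bitset0(ActiveBits, FF1);
         //   } while (ActiveBits != 0);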
5188     MachineBasicBlock::iterator I = BB.end();
5189     Register SrcReg = MI.getOperand(1).getReg();
5190 
5191     // Create control flow for the loop:
5192     // split MI's machine basic block to form the loop.
5193     auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5194 
5195     // Create virtual registers required for lowering.
5196     const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5197     const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5198     Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5199     Register InitialValReg = MRI.createVirtualRegister(DstRegClass);
5200 
5201     Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5202     Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5203     Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5204 
5205     Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
5206     Register LaneValueReg =
5207         MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5208 
5209     bool IsWave32 = ST.isWave32();
5210     unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5211     unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5212 
5213     // Create the initial values of the induction variable (a copy of EXEC)
5214     // and the accumulator, then branch to the new ComputeLoop block.
5215     uint32_t InitialValue = getIdentityValueForWaveReduction(Opc);
5216     auto TmpSReg =
5217         BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
5218     BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitialValReg)
5219         .addImm(InitialValue);
5220     // clang-format off
5221     BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5222         .addMBB(ComputeLoop);
5223     // clang-format on
5224 
5225     // Start constructing ComputeLoop
5226     I = ComputeLoop->end();
5227     auto Accumulator =
5228         BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5229             .addReg(InitialValReg)
5230             .addMBB(&BB);
5231     auto ActiveBits =
5232         BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5233             .addReg(TmpSReg->getOperand(0).getReg())
5234             .addMBB(&BB);
5235 
5236     // Perform the computations
5237     unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5238     auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5239                    .addReg(ActiveBits->getOperand(0).getReg());
5240     auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5241                              TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5242                          .addReg(SrcReg)
5243                          .addReg(FF1->getOperand(0).getReg());
5244     auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5245                               .addReg(Accumulator->getOperand(0).getReg())
5246                               .addReg(LaneValue->getOperand(0).getReg());
5247 
5248     // Clear the current lane's bit in the induction variable to get the
5249     unsigned BITSETOpc =
5250         IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5251     auto NewActiveBits =
5252         BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5253             .addReg(FF1->getOperand(0).getReg())
5254             .addReg(ActiveBits->getOperand(0).getReg());
5255 
5256     // Complete the PHI nodes with their loop back-edge values
5257     Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5258         .addMBB(ComputeLoop);
5259     ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5260         .addMBB(ComputeLoop);
5261 
5262     // Create the branch: loop back while any active lanes remain
5263     unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5264     BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5265         .addReg(NewActiveBits->getOperand(0).getReg())
5266         .addImm(0);
5267     BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5268         .addMBB(ComputeLoop);
5269 
5270     RetBB = ComputeEnd;
5271   }
5272   MI.eraseFromParent();
5273   return RetBB;
5274 }
5275 
5276 MachineBasicBlock *
5277 SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
5278                                               MachineBasicBlock *BB) const {
5279 
5280   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5281   MachineFunction *MF = BB->getParent();
5282   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
5283 
5284   switch (MI.getOpcode()) {
5285   case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5286     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5287   case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5288     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5289   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5290     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5291   case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5292     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5293   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5294     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5295   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5296     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5297   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5298     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5299   case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5300     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5301   case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5302     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5303   case AMDGPU::S_UADDO_PSEUDO:
5304   case AMDGPU::S_USUBO_PSEUDO: {
5305     const DebugLoc &DL = MI.getDebugLoc();
5306     MachineOperand &Dest0 = MI.getOperand(0);
5307     MachineOperand &Dest1 = MI.getOperand(1);
5308     MachineOperand &Src0 = MI.getOperand(2);
5309     MachineOperand &Src1 = MI.getOperand(3);
5310 
5311     unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5312                        ? AMDGPU::S_ADD_I32
5313                        : AMDGPU::S_SUB_I32;
5314     // clang-format off
5315     BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5316         .add(Src0)
5317         .add(Src1);
5318     // clang-format on
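         // S_ADD_I32/S_SUB_I32 set SCC on unsigned carry/borrow, which the
         // S_CSELECT_B64 below materializes as a 1/0 overflow result.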
5319 
5320     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5321         .addImm(1)
5322         .addImm(0);
5323 
5324     MI.eraseFromParent();
5325     return BB;
5326   }
5327   case AMDGPU::S_ADD_U64_PSEUDO:
5328   case AMDGPU::S_SUB_U64_PSEUDO: {
5329     // For targets older than GFX12, we emit a sequence of 32-bit operations.
5330     // For GFX12, we emit s_add_u64 and s_sub_u64.
5331     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5332     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5333     const DebugLoc &DL = MI.getDebugLoc();
5334     MachineOperand &Dest = MI.getOperand(0);
5335     MachineOperand &Src0 = MI.getOperand(1);
5336     MachineOperand &Src1 = MI.getOperand(2);
5337     bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5338     if (Subtarget->hasScalarAddSub64()) {
5339       unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5340       // clang-format off
5341       BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5342           .add(Src0)
5343           .add(Src1);
5344       // clang-format on
5345     } else {
5346       const SIRegisterInfo *TRI = ST.getRegisterInfo();
5347       const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5348 
5349       Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5350       Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5351 
5352       MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5353           MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5354       MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5355           MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5356 
5357       MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5358           MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5359       MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5360           MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5361 
5362       unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5363       unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
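           // The low half sets SCC to the carry/borrow-out, which the
           // high-half S_ADDC_U32/S_SUBB_U32 then consumes as carry-in.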
5364       BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5365           .add(Src0Sub0)
5366           .add(Src1Sub0);
5367       BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5368           .add(Src0Sub1)
5369           .add(Src1Sub1);
5370       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5371           .addReg(DestSub0)
5372           .addImm(AMDGPU::sub0)
5373           .addReg(DestSub1)
5374           .addImm(AMDGPU::sub1);
5375     }
5376     MI.eraseFromParent();
5377     return BB;
5378   }
5379   case AMDGPU::V_ADD_U64_PSEUDO:
5380   case AMDGPU::V_SUB_U64_PSEUDO: {
5381     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5382     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5383     const SIRegisterInfo *TRI = ST.getRegisterInfo();
5384     const DebugLoc &DL = MI.getDebugLoc();
5385 
5386     bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5387 
5388     MachineOperand &Dest = MI.getOperand(0);
5389     MachineOperand &Src0 = MI.getOperand(1);
5390     MachineOperand &Src1 = MI.getOperand(2);
5391 
5392     if (IsAdd && ST.hasLshlAddU64Inst()) {
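           // v_lshl_add_u64 computes (src0 << shift) + src1, so a shift of 0
           // gives a plain 64-bit add in a single instruction.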
5393       auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5394                          Dest.getReg())
5395                      .add(Src0)
5396                      .addImm(0)
5397                      .add(Src1);
5398       TII->legalizeOperands(*Add);
5399       MI.eraseFromParent();
5400       return BB;
5401     }
5402 
5403     const auto *CarryRC = TRI->getWaveMaskRegClass();
5404 
5405     Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5406     Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5407 
5408     Register CarryReg = MRI.createVirtualRegister(CarryRC);
5409     Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5410 
5411     const TargetRegisterClass *Src0RC = Src0.isReg()
5412                                             ? MRI.getRegClass(Src0.getReg())
5413                                             : &AMDGPU::VReg_64RegClass;
5414     const TargetRegisterClass *Src1RC = Src1.isReg()
5415                                             ? MRI.getRegClass(Src1.getReg())
5416                                             : &AMDGPU::VReg_64RegClass;
5417 
5418     const TargetRegisterClass *Src0SubRC =
5419         TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5420     const TargetRegisterClass *Src1SubRC =
5421         TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5422 
5423     MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5424         MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5425     MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5426         MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5427 
5428     MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5429         MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5430     MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5431         MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
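         // Chain the carry through the condition registers: the low half
         // defines CarryReg, the high half consumes it; the high half's own
         // carry-out is dead.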
5432 
5433     unsigned LoOpc =
5434         IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5435     MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5436                                .addReg(CarryReg, RegState::Define)
5437                                .add(SrcReg0Sub0)
5438                                .add(SrcReg1Sub0)
5439                                .addImm(0); // clamp bit
5440 
5441     unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5442     MachineInstr *HiHalf =
5443         BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5444             .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5445             .add(SrcReg0Sub1)
5446             .add(SrcReg1Sub1)
5447             .addReg(CarryReg, RegState::Kill)
5448             .addImm(0); // clamp bit
5449 
5450     BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5451         .addReg(DestSub0)
5452         .addImm(AMDGPU::sub0)
5453         .addReg(DestSub1)
5454         .addImm(AMDGPU::sub1);
5455     TII->legalizeOperands(*LoHalf);
5456     TII->legalizeOperands(*HiHalf);
5457     MI.eraseFromParent();
5458     return BB;
5459   }
5460   case AMDGPU::S_ADD_CO_PSEUDO:
5461   case AMDGPU::S_SUB_CO_PSEUDO: {
5462     // This pseudo can only be selected from a uniform add/subcarry node,
5463     // so all of its VGPR operands are assumed to hold uniform (splat)
5464     // values.
5465     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5466     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5467     const SIRegisterInfo *TRI = ST.getRegisterInfo();
5468     MachineBasicBlock::iterator MII = MI;
5469     const DebugLoc &DL = MI.getDebugLoc();
5470     MachineOperand &Dest = MI.getOperand(0);
5471     MachineOperand &CarryDest = MI.getOperand(1);
5472     MachineOperand &Src0 = MI.getOperand(2);
5473     MachineOperand &Src1 = MI.getOperand(3);
5474     MachineOperand &Src2 = MI.getOperand(4);
5475     unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5476                        ? AMDGPU::S_ADDC_U32
5477                        : AMDGPU::S_SUBB_U32;
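         // Since the operands are known uniform, any VGPR inputs can be
         // narrowed to SGPRs by reading lane 0 with v_readfirstlane.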
5478     if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5479       Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5480       BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5481           .addReg(Src0.getReg());
5482       Src0.setReg(RegOp0);
5483     }
5484     if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5485       Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5486       BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5487           .addReg(Src1.getReg());
5488       Src1.setReg(RegOp1);
5489     }
5490     Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5491     if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5492       BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5493           .addReg(Src2.getReg());
5494       Src2.setReg(RegOp2);
5495     }
5496 
5497     const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5498     unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5499     assert(WaveSize == 64 || WaveSize == 32);
5500 
5501     if (WaveSize == 64) {
5502       if (ST.hasScalarCompareEq64()) {
5503         BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5504             .addReg(Src2.getReg())
5505             .addImm(0);
5506       } else {
5507         const TargetRegisterClass *SubRC =
5508             TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5509         MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5510             MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5511         MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5512             MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5513         Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5514 
5515         BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5516             .add(Src2Sub0)
5517             .add(Src2Sub1);
5518 
5519         BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5520             .addReg(Src2_32, RegState::Kill)
5521             .addImm(0);
5522       }
5523     } else {
5524       BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5525           .addReg(Src2.getReg())
5526           .addImm(0);
5527     }
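         // SCC now holds whether the incoming carry mask was nonzero; the
         // S_ADDC/S_SUBB below consumes it, and the final S_CSELECT
         // rematerializes the carry-out as an all-ones or zero mask.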
5528 
5529     // clang-format off
5530     BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
5531         .add(Src0)
5532         .add(Src1);
5533     // clang-format on
5534 
5535     unsigned SelOpc =
5536         (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5537 
5538     BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5539         .addImm(-1)
5540         .addImm(0);
5541 
5542     MI.eraseFromParent();
5543     return BB;
5544   }
5545   case AMDGPU::SI_INIT_M0: {
5546     MachineOperand &M0Init = MI.getOperand(0);
5547     BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5548             TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
5549             AMDGPU::M0)
5550         .add(M0Init);
5551     MI.eraseFromParent();
5552     return BB;
5553   }
5554   case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
5555     // Set SCC to true, in case the barrier instruction gets converted to a NOP.
5556     BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5557             TII->get(AMDGPU::S_CMP_EQ_U32))
5558         .addImm(0)
5559         .addImm(0);
5560     return BB;
5561   }
5562   case AMDGPU::GET_GROUPSTATICSIZE: {
5563     assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5564            getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5565     DebugLoc DL = MI.getDebugLoc();
5566     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5567         .add(MI.getOperand(0))
5568         .addImm(MFI->getLDSSize());
5569     MI.eraseFromParent();
5570     return BB;
5571   }
5572   case AMDGPU::GET_SHADERCYCLESHILO: {
5573     assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
5574     MachineRegisterInfo &MRI = MF->getRegInfo();
5575     const DebugLoc &DL = MI.getDebugLoc();
5576     // The algorithm is:
5577     //
5578     // hi1 = getreg(SHADER_CYCLES_HI)
5579     // lo1 = getreg(SHADER_CYCLES_LO)
5580     // hi2 = getreg(SHADER_CYCLES_HI)
5581     //
5582     // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5583     // Otherwise there was overflow and the result is hi2:0. In both cases the
5584     // result should represent the actual time at some point during the sequence
5585     // of three getregs.
5586     using namespace AMDGPU::Hwreg;
5587     Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5588     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5589         .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5590     Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5591     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5592         .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5593     Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5594     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5595         .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5596     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5597         .addReg(RegHi1)
5598         .addReg(RegHi2);
5599     Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5600     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5601         .addReg(RegLo1)
5602         .addImm(0);
5603     BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5604         .add(MI.getOperand(0))
5605         .addReg(RegLo)
5606         .addImm(AMDGPU::sub0)
5607         .addReg(RegHi2)
5608         .addImm(AMDGPU::sub1);
5609     MI.eraseFromParent();
5610     return BB;
5611   }
5612   case AMDGPU::SI_INDIRECT_SRC_V1:
5613   case AMDGPU::SI_INDIRECT_SRC_V2:
5614   case AMDGPU::SI_INDIRECT_SRC_V4:
5615   case AMDGPU::SI_INDIRECT_SRC_V8:
5616   case AMDGPU::SI_INDIRECT_SRC_V9:
5617   case AMDGPU::SI_INDIRECT_SRC_V10:
5618   case AMDGPU::SI_INDIRECT_SRC_V11:
5619   case AMDGPU::SI_INDIRECT_SRC_V12:
5620   case AMDGPU::SI_INDIRECT_SRC_V16:
5621   case AMDGPU::SI_INDIRECT_SRC_V32:
5622     return emitIndirectSrc(MI, *BB, *getSubtarget());
5623   case AMDGPU::SI_INDIRECT_DST_V1:
5624   case AMDGPU::SI_INDIRECT_DST_V2:
5625   case AMDGPU::SI_INDIRECT_DST_V4:
5626   case AMDGPU::SI_INDIRECT_DST_V8:
5627   case AMDGPU::SI_INDIRECT_DST_V9:
5628   case AMDGPU::SI_INDIRECT_DST_V10:
5629   case AMDGPU::SI_INDIRECT_DST_V11:
5630   case AMDGPU::SI_INDIRECT_DST_V12:
5631   case AMDGPU::SI_INDIRECT_DST_V16:
5632   case AMDGPU::SI_INDIRECT_DST_V32:
5633     return emitIndirectDst(MI, *BB, *getSubtarget());
5634   case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5635   case AMDGPU::SI_KILL_I1_PSEUDO:
5636     return splitKillBlock(MI, BB);
5637   case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5638     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5639     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5640     const SIRegisterInfo *TRI = ST.getRegisterInfo();
5641 
5642     Register Dst = MI.getOperand(0).getReg();
5643     const MachineOperand &Src0 = MI.getOperand(1);
5644     const MachineOperand &Src1 = MI.getOperand(2);
5645     const DebugLoc &DL = MI.getDebugLoc();
5646     Register SrcCond = MI.getOperand(3).getReg();
5647 
5648     Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5649     Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5650     const auto *CondRC = TRI->getWaveMaskRegClass();
5651     Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5652 
5653     const TargetRegisterClass *Src0RC = Src0.isReg()
5654                                             ? MRI.getRegClass(Src0.getReg())
5655                                             : &AMDGPU::VReg_64RegClass;
5656     const TargetRegisterClass *Src1RC = Src1.isReg()
5657                                             ? MRI.getRegClass(Src1.getReg())
5658                                             : &AMDGPU::VReg_64RegClass;
5659 
5660     const TargetRegisterClass *Src0SubRC =
5661         TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5662     const TargetRegisterClass *Src1SubRC =
5663         TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5664 
5665     MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5666         MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5667     MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5668         MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5669 
5670     MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5671         MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5672     MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5673         MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5674 
5675     BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
5676     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5677         .addImm(0)
5678         .add(Src0Sub0)
5679         .addImm(0)
5680         .add(Src1Sub0)
5681         .addReg(SrcCondCopy);
5682     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5683         .addImm(0)
5684         .add(Src0Sub1)
5685         .addImm(0)
5686         .add(Src1Sub1)
5687         .addReg(SrcCondCopy);
5688 
5689     BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5690         .addReg(DstLo)
5691         .addImm(AMDGPU::sub0)
5692         .addReg(DstHi)
5693         .addImm(AMDGPU::sub1);
5694     MI.eraseFromParent();
5695     return BB;
5696   }
5697   case AMDGPU::SI_BR_UNDEF: {
5698     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5699     const DebugLoc &DL = MI.getDebugLoc();
5700     MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5701                            .add(MI.getOperand(0));
5702     Br->getOperand(1).setIsUndef(); // read undef SCC
5703     MI.eraseFromParent();
5704     return BB;
5705   }
5706   case AMDGPU::ADJCALLSTACKUP:
5707   case AMDGPU::ADJCALLSTACKDOWN: {
5708     const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5709     MachineInstrBuilder MIB(*MF, &MI);
5710     MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5711         .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5712     return BB;
5713   }
5714   case AMDGPU::SI_CALL_ISEL: {
5715     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5716     const DebugLoc &DL = MI.getDebugLoc();
5717 
5718     unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5719 
5720     MachineInstrBuilder MIB;
5721     MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5722 
5723     for (const MachineOperand &MO : MI.operands())
5724       MIB.add(MO);
5725 
5726     MIB.cloneMemRefs(MI);
5727     MI.eraseFromParent();
5728     return BB;
5729   }
5730   case AMDGPU::V_ADD_CO_U32_e32:
5731   case AMDGPU::V_SUB_CO_U32_e32:
5732   case AMDGPU::V_SUBREV_CO_U32_e32: {
5733     // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5734     const DebugLoc &DL = MI.getDebugLoc();
5735     unsigned Opc = MI.getOpcode();
5736 
5737     bool NeedClampOperand = false;
5738     if (TII->pseudoToMCOpcode(Opc) == -1) {
5739       Opc = AMDGPU::getVOPe64(Opc);
5740       NeedClampOperand = true;
5741     }
5742 
5743     auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5744     if (TII->isVOP3(*I)) {
5745       const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5746       const SIRegisterInfo *TRI = ST.getRegisterInfo();
5747       I.addReg(TRI->getVCC(), RegState::Define);
5748     }
5749     I.add(MI.getOperand(1)).add(MI.getOperand(2));
5750     if (NeedClampOperand)
5751       I.addImm(0); // clamp bit for e64 encoding
5752 
5753     TII->legalizeOperands(*I);
5754 
5755     MI.eraseFromParent();
5756     return BB;
5757   }
5758   case AMDGPU::V_ADDC_U32_e32:
5759   case AMDGPU::V_SUBB_U32_e32:
5760   case AMDGPU::V_SUBBREV_U32_e32:
5761     // These instructions have an implicit use of vcc which counts towards the
5762     // constant bus limit.
5763     TII->legalizeOperands(MI);
5764     return BB;
5765   case AMDGPU::DS_GWS_INIT:
5766   case AMDGPU::DS_GWS_SEMA_BR:
5767   case AMDGPU::DS_GWS_BARRIER:
5768     TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5769     [[fallthrough]];
5770   case AMDGPU::DS_GWS_SEMA_V:
5771   case AMDGPU::DS_GWS_SEMA_P:
5772   case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5773     // An s_waitcnt 0 is required to be the instruction immediately following.
5774     if (getSubtarget()->hasGWSAutoReplay()) {
5775       bundleInstWithWaitcnt(MI);
5776       return BB;
5777     }
5778 
5779     return emitGWSMemViolTestLoop(MI, BB);
5780   case AMDGPU::S_SETREG_B32: {
5781     // Try to optimize cases that only set the denormal mode or rounding mode.
5782     //
5783     // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5784     // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5785     // instead.
5786     //
5787     // FIXME: This could be done with predicates on the immediate, but
5788     // tablegen doesn't allow a side-effect-free instruction in the output
5789     // of a side-effecting pattern.
5790     auto [ID, Offset, Width] =
5791         AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5792     if (ID != AMDGPU::Hwreg::ID_MODE)
5793       return BB;
5794 
5795     const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5796     const unsigned SetMask = WidthMask << Offset;
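         // For instance (assuming the usual MODE register layout), a setreg
         // of hwreg(HW_REG_MODE, 0, 4) gives SetMask == FP_ROUND_MASK (bits
         // [3:0]), and offset 4 with width 4 gives FP_DENORM_MASK ([7:4]).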
5797 
5798     if (getSubtarget()->hasDenormModeInst()) {
5799       unsigned SetDenormOp = 0;
5800       unsigned SetRoundOp = 0;
5801 
5802       // The dedicated instructions can only set the whole denorm or round mode
5803       // at once, not a subset of bits in either.
5804       if (SetMask ==
5805           (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5806         // If this fully sets both the round and denorm mode, emit the two
5807         // dedicated instructions for these.
5808         SetRoundOp = AMDGPU::S_ROUND_MODE;
5809         SetDenormOp = AMDGPU::S_DENORM_MODE;
5810       } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5811         SetRoundOp = AMDGPU::S_ROUND_MODE;
5812       } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5813         SetDenormOp = AMDGPU::S_DENORM_MODE;
5814       }
5815 
5816       if (SetRoundOp || SetDenormOp) {
5817         MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5818         MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5819         if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5820           unsigned ImmVal = Def->getOperand(1).getImm();
5821           if (SetRoundOp) {
5822             BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5823                 .addImm(ImmVal & 0xf);
5824 
5825             // If we also have the denorm mode, get just the denorm mode bits.
5826             ImmVal >>= 4;
5827           }
5828 
5829           if (SetDenormOp) {
5830             BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5831                 .addImm(ImmVal & 0xf);
5832           }
5833 
5834           MI.eraseFromParent();
5835           return BB;
5836         }
5837       }
5838     }
5839 
5840     // If only FP bits are touched, use the side-effect-free pseudo.
5841     if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5842                     AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5843       MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5844 
5845     return BB;
5846   }
5847   case AMDGPU::S_INVERSE_BALLOT_U32:
5848   case AMDGPU::S_INVERSE_BALLOT_U64:
5849     // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5850     // necessary. After that they are equivalent to a COPY.
5851     MI.setDesc(TII->get(AMDGPU::COPY));
5852     return BB;
5853   case AMDGPU::ENDPGM_TRAP: {
5854     const DebugLoc &DL = MI.getDebugLoc();
5855     if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5856       MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5857       MI.addOperand(MachineOperand::CreateImm(0));
5858       return BB;
5859     }
5860 
5861     // We need a block split to make the real endpgm a terminator. We also don't
5862     // want to break phis in successor blocks, so we can't just delete to the
5863     // end of the block.
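         // The resulting CFG is roughly:
         //   BB:     ... s_cbranch_execnz TrapBB; fall through to SplitBB
         //   TrapBB: s_endpgm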
5864 
5865     MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5866     MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5867     MF->push_back(TrapBB);
5868     // clang-format off
5869     BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5870         .addImm(0);
5871     BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5872         .addMBB(TrapBB);
5873     // clang-format on
5874 
5875     BB->addSuccessor(TrapBB);
5876     MI.eraseFromParent();
5877     return SplitBB;
5878   }
5879   case AMDGPU::SIMULATED_TRAP: {
5880     assert(Subtarget->hasPrivEnabledTrap2NopBug());
5881     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5882     MachineBasicBlock *SplitBB =
5883         TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5884     MI.eraseFromParent();
5885     return SplitBB;
5886   }
5887   default:
5888     if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5889       if (!MI.mayStore())
5890         AddMemOpInit(MI);
5891       return BB;
5892     }
5893     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
5894   }
5895 }
5896 
5897 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
5898   // This currently forces unfolding various combinations of fsub into fma with
5899   // free fneg'd operands. As long as we have fast FMA (controlled by
5900   // isFMAFasterThanFMulAndFAdd), we should perform these.
5901 
5902   // When fma is quarter rate (e.g. for f64, where add / sub are at best half
5903   // rate), most of these combines appear to be cycle neutral but save on
5904   // instruction count / code size.
5905   return true;
5906 }
5907 
5908 bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
5909 
5910 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
5911                                          EVT VT) const {
5912   if (!VT.isVector()) {
5913     return MVT::i1;
5914   }
5915   return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5916 }
5917 
5918 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
5919   // TODO: Should i16 always be used if legal? For now it would force VALU
5920   // shifts.
5921   return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5922 }
5923 
5924 LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
5925   return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5926              ? Ty.changeElementSize(16)
5927              : Ty.changeElementSize(32);
5928 }
5929 
5930 // Answering this is somewhat tricky and depends on the specific device,
5931 // since devices have different rates for fma and for f64 operations.
5932 //
5933 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5934 // regardless of which device (although the number of cycles differs between
5935 // devices), so it is always profitable for f64.
5936 //
5937 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5938 // only on full-rate devices. Normally, we should prefer selecting v_mad_f32,
5939 // which we can always do even without fused FP ops since it returns the same
5940 // result as the separate operations and is always full rate. Therefore, we
5941 // lie and report that fma is not faster for f32. v_mad_f32, however, does
5942 // not support denormals, so we do report fma as faster if we have a fast fma
5943 // device and denormals are required.
5944 //
5945 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5946                                                   EVT VT) const {
5947   VT = VT.getScalarType();
5948 
5949   switch (VT.getSimpleVT().SimpleTy) {
5950   case MVT::f32: {
5951     // If mad is not available, this depends only on whether f32 fma is full rate.
5952     if (!Subtarget->hasMadMacF32Insts())
5953       return Subtarget->hasFastFMAF32();
5954 
5955     // Otherwise f32 mad is always full rate and returns the same result as
5956     // the separate operations, so it should be preferred over fma.
5957     // However, it does not support denormals.
5958     if (!denormalModeIsFlushAllF32(MF))
5959       return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5960 
5961     // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5962     return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5963   }
5964   case MVT::f64:
5965     return true;
5966   case MVT::f16:
5967     return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5968   default:
5969     break;
5970   }
5971 
5972   return false;
5973 }
5974 
5975 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5976                                                   LLT Ty) const {
5977   switch (Ty.getScalarSizeInBits()) {
5978   case 16:
5979     return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5980   case 32:
5981     return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5982   case 64:
5983     return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5984   default:
5985     break;
5986   }
5987 
5988   return false;
5989 }
5990 
5991 bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
5992   if (!Ty.isScalar())
5993     return false;
5994 
5995   if (Ty.getScalarSizeInBits() == 16)
5996     return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5997   if (Ty.getScalarSizeInBits() == 32)
5998     return Subtarget->hasMadMacF32Insts() &&
5999            denormalModeIsFlushAllF32(*MI.getMF());
6000 
6001   return false;
6002 }
6003 
6004 bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
6005                                    const SDNode *N) const {
6006   // TODO: Check future ftz flag
6007   // v_mad_f32/v_mac_f32 do not support denormals.
6008   EVT VT = N->getValueType(0);
6009   if (VT == MVT::f32)
6010     return Subtarget->hasMadMacF32Insts() &&
6011            denormalModeIsFlushAllF32(DAG.getMachineFunction());
6012   if (VT == MVT::f16) {
6013     return Subtarget->hasMadF16() &&
6014            denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
6015   }
6016 
6017   return false;
6018 }
6019 
6020 //===----------------------------------------------------------------------===//
6021 // Custom DAG Lowering Operations
6022 //===----------------------------------------------------------------------===//
6023 
6024 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6025 // wider vector type is legal.
6026 SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
6027                                              SelectionDAG &DAG) const {
6028   unsigned Opc = Op.getOpcode();
6029   EVT VT = Op.getValueType();
6030   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6031          VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6032          VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6033          VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6034 
6035   auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6036 
6037   SDLoc SL(Op);
6038   SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6039   SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6040 
6041   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6042 }
6043 
6044 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6045 // wider vector type is legal.
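     // For example, a v8f16 operation is split into two v4f16 operations whose
     // results are concatenated, rather than being scalarized into 8 ops.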
6046 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
6047                                               SelectionDAG &DAG) const {
6048   unsigned Opc = Op.getOpcode();
6049   EVT VT = Op.getValueType();
6050   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6051          VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6052          VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6053          VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6054          VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6055          VT == MVT::v32bf16);
6056 
6057   auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6058   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6059 
6060   SDLoc SL(Op);
6061 
6062   SDValue OpLo =
6063       DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6064   SDValue OpHi =
6065       DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6066 
6067   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6068 }
6069 
6070 SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6071                                                SelectionDAG &DAG) const {
6072   unsigned Opc = Op.getOpcode();
6073   EVT VT = Op.getValueType();
6074   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6075          VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6076          VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6077          VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6078          VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6079          VT == MVT::v32bf16);
6080 
6081   SDValue Op0 = Op.getOperand(0);
6082   auto [Lo0, Hi0] = Op0.getValueType().isVector()
6083                         ? DAG.SplitVectorOperand(Op.getNode(), 0)
6084                         : std::pair(Op0, Op0);
6085 
6086   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6087   auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6088 
6089   SDLoc SL(Op);
6090   auto ResVT = DAG.GetSplitDestVTs(VT);
6091 
6092   SDValue OpLo =
6093       DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6094   SDValue OpHi =
6095       DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6096 
6097   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6098 }
6099 
6100 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6101   switch (Op.getOpcode()) {
6102   default:
6103     return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6104   case ISD::BRCOND:
6105     return LowerBRCOND(Op, DAG);
6106   case ISD::RETURNADDR:
6107     return LowerRETURNADDR(Op, DAG);
6108   case ISD::LOAD: {
6109     SDValue Result = LowerLOAD(Op, DAG);
6110     assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6111            "Load should return a value and a chain");
6112     return Result;
6113   }
6114   case ISD::FSQRT: {
6115     EVT VT = Op.getValueType();
6116     if (VT == MVT::f32)
6117       return lowerFSQRTF32(Op, DAG);
6118     if (VT == MVT::f64)
6119       return lowerFSQRTF64(Op, DAG);
6120     return SDValue();
6121   }
6122   case ISD::FSIN:
6123   case ISD::FCOS:
6124     return LowerTrig(Op, DAG);
6125   case ISD::SELECT:
6126     return LowerSELECT(Op, DAG);
6127   case ISD::FDIV:
6128     return LowerFDIV(Op, DAG);
6129   case ISD::FFREXP:
6130     return LowerFFREXP(Op, DAG);
6131   case ISD::ATOMIC_CMP_SWAP:
6132     return LowerATOMIC_CMP_SWAP(Op, DAG);
6133   case ISD::STORE:
6134     return LowerSTORE(Op, DAG);
6135   case ISD::GlobalAddress: {
6136     MachineFunction &MF = DAG.getMachineFunction();
6137     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6138     return LowerGlobalAddress(MFI, Op, DAG);
6139   }
6140   case ISD::INTRINSIC_WO_CHAIN:
6141     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6142   case ISD::INTRINSIC_W_CHAIN:
6143     return LowerINTRINSIC_W_CHAIN(Op, DAG);
6144   case ISD::INTRINSIC_VOID:
6145     return LowerINTRINSIC_VOID(Op, DAG);
6146   case ISD::ADDRSPACECAST:
6147     return lowerADDRSPACECAST(Op, DAG);
6148   case ISD::INSERT_SUBVECTOR:
6149     return lowerINSERT_SUBVECTOR(Op, DAG);
6150   case ISD::INSERT_VECTOR_ELT:
6151     return lowerINSERT_VECTOR_ELT(Op, DAG);
6152   case ISD::EXTRACT_VECTOR_ELT:
6153     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6154   case ISD::VECTOR_SHUFFLE:
6155     return lowerVECTOR_SHUFFLE(Op, DAG);
6156   case ISD::SCALAR_TO_VECTOR:
6157     return lowerSCALAR_TO_VECTOR(Op, DAG);
6158   case ISD::BUILD_VECTOR:
6159     return lowerBUILD_VECTOR(Op, DAG);
6160   case ISD::FP_ROUND:
6161   case ISD::STRICT_FP_ROUND:
6162     return lowerFP_ROUND(Op, DAG);
6163   case ISD::TRAP:
6164     return lowerTRAP(Op, DAG);
6165   case ISD::DEBUGTRAP:
6166     return lowerDEBUGTRAP(Op, DAG);
6167   case ISD::ABS:
6168   case ISD::FABS:
6169   case ISD::FNEG:
6170   case ISD::FCANONICALIZE:
6171   case ISD::BSWAP:
6172     return splitUnaryVectorOp(Op, DAG);
6173   case ISD::FMINNUM:
6174   case ISD::FMAXNUM:
6175     return lowerFMINNUM_FMAXNUM(Op, DAG);
6176   case ISD::FMINIMUMNUM:
6177   case ISD::FMAXIMUMNUM:
6178     return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6179   case ISD::FMINIMUM:
6180   case ISD::FMAXIMUM:
6181     return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6182   case ISD::FLDEXP:
6183   case ISD::STRICT_FLDEXP:
6184     return lowerFLDEXP(Op, DAG);
6185   case ISD::FMA:
6186     return splitTernaryVectorOp(Op, DAG);
6187   case ISD::FP_TO_SINT:
6188   case ISD::FP_TO_UINT:
6189     return LowerFP_TO_INT(Op, DAG);
6190   case ISD::SHL:
6191   case ISD::SRA:
6192   case ISD::SRL:
6193   case ISD::ADD:
6194   case ISD::SUB:
6195   case ISD::SMIN:
6196   case ISD::SMAX:
6197   case ISD::UMIN:
6198   case ISD::UMAX:
6199   case ISD::FADD:
6200   case ISD::FMUL:
6201   case ISD::FMINNUM_IEEE:
6202   case ISD::FMAXNUM_IEEE:
6203   case ISD::UADDSAT:
6204   case ISD::USUBSAT:
6205   case ISD::SADDSAT:
6206   case ISD::SSUBSAT:
6207     return splitBinaryVectorOp(Op, DAG);
6208   case ISD::FCOPYSIGN:
6209     return lowerFCOPYSIGN(Op, DAG);
6210   case ISD::MUL:
6211     return lowerMUL(Op, DAG);
6212   case ISD::SMULO:
6213   case ISD::UMULO:
6214     return lowerXMULO(Op, DAG);
6215   case ISD::SMUL_LOHI:
6216   case ISD::UMUL_LOHI:
6217     return lowerXMUL_LOHI(Op, DAG);
6218   case ISD::DYNAMIC_STACKALLOC:
6219     return LowerDYNAMIC_STACKALLOC(Op, DAG);
6220   case ISD::STACKSAVE:
6221     return LowerSTACKSAVE(Op, DAG);
6222   case ISD::GET_ROUNDING:
6223     return lowerGET_ROUNDING(Op, DAG);
6224   case ISD::SET_ROUNDING:
6225     return lowerSET_ROUNDING(Op, DAG);
6226   case ISD::PREFETCH:
6227     return lowerPREFETCH(Op, DAG);
6228   case ISD::FP_EXTEND:
6229   case ISD::STRICT_FP_EXTEND:
6230     return lowerFP_EXTEND(Op, DAG);
6231   case ISD::GET_FPENV:
6232     return lowerGET_FPENV(Op, DAG);
6233   case ISD::SET_FPENV:
6234     return lowerSET_FPENV(Op, DAG);
6235   }
6236   return SDValue();
6237 }
6238 
6239 // Used for D16: casts the result of an instruction into the right vector
6240 // type, packing the values if loads return unpacked values.
6241 static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
6242                                        const SDLoc &DL, SelectionDAG &DAG,
6243                                        bool Unpacked) {
6244   if (!LoadVT.isVector())
6245     return Result;
6246 
6247   // Cast back to the original packed type, or to a larger type that is a
6248   // multiple of 32 bits for D16. Widening the return type is required for
6249   // legalization.
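       // For example, on an unpacked-D16 target a v3f16 load arrives as v3i32;
       // each element is truncated to i16, padded out to v4i16, and the result
       // is bitcast to v4f16.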
6250   EVT FittingLoadVT = LoadVT;
6251   if ((LoadVT.getVectorNumElements() % 2) == 1) {
6252     FittingLoadVT =
6253         EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6254                          LoadVT.getVectorNumElements() + 1);
6255   }
6256 
6257   if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6258     // Truncate to v2i16/v4i16.
6259     EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6260 
6261     // Work around the legalizer scalarizing the truncate after vector op
6262     // legalization without creating an intermediate vector trunc.
6263     SmallVector<SDValue, 4> Elts;
6264     DAG.ExtractVectorElements(Result, Elts);
6265     for (SDValue &Elt : Elts)
6266       Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6267 
6268     // Pad illegal v1i16/v3f16 to v4i16
6269     if ((LoadVT.getVectorNumElements() % 2) == 1)
6270       Elts.push_back(DAG.getPOISON(MVT::i16));
6271 
6272     Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6273 
6274     // Bitcast to original type (v2f16/v4f16).
6275     return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6276   }
6277 
6278   // Cast back to the original packed type.
6279   return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6280 }
6281 
6282 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6283                                               SelectionDAG &DAG,
6284                                               ArrayRef<SDValue> Ops,
6285                                               bool IsIntrinsic) const {
6286   SDLoc DL(M);
6287 
6288   bool Unpacked = Subtarget->hasUnpackedD16VMem();
6289   EVT LoadVT = M->getValueType(0);
6290 
6291   EVT EquivLoadVT = LoadVT;
6292   if (LoadVT.isVector()) {
6293     if (Unpacked) {
6294       EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6295                                      LoadVT.getVectorNumElements());
6296     } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6297       // Widen v3f16 to legal type
6298       EquivLoadVT =
6299           EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6300                            LoadVT.getVectorNumElements() + 1);
6301     }
6302   }
6303 
6304   // Change from v4f16/v2f16 to EquivLoadVT.
6305   SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6306 
6307   SDValue Load = DAG.getMemIntrinsicNode(
6308       IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6309       M->getMemoryVT(), M->getMemOperand());
6310 
6311   SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6312 
6313   return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6314 }
6315 
6316 SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6317                                              SelectionDAG &DAG,
6318                                              ArrayRef<SDValue> Ops) const {
6319   SDLoc DL(M);
6320   EVT LoadVT = M->getValueType(0);
6321   EVT EltType = LoadVT.getScalarType();
6322   EVT IntVT = LoadVT.changeTypeToInteger();
6323 
6324   bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6325 
6326   assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6327   bool IsTFE = M->getNumValues() == 3;
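       // With TFE (texture fail enable) the load returns an extra status
       // dword, which is why the node then has a third result value.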
6328 
6329   unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6330                                    : AMDGPUISD::BUFFER_LOAD_FORMAT)
6331                  : IsTFE  ? AMDGPUISD::BUFFER_LOAD_TFE
6332                           : AMDGPUISD::BUFFER_LOAD;
6333 
6334   if (IsD16) {
6335     return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6336   }
6337 
6338   // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6339   if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6340     return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6341                                       IsTFE);
6342 
6343   if (isTypeLegal(LoadVT)) {
6344     return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6345                                M->getMemOperand(), DAG);
6346   }
6347 
6348   EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6349   SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6350   SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6351                                         M->getMemOperand(), DAG);
6352   return DAG.getMergeValues(
6353       {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6354       DL);
6355 }
6356 
6357 static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6358                                   SelectionDAG &DAG) {
6359   EVT VT = N->getValueType(0);
6360   unsigned CondCode = N->getConstantOperandVal(3);
6361   if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6362     return DAG.getPOISON(VT);
6363 
6364   ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6365 
6366   SDValue LHS = N->getOperand(1);
6367   SDValue RHS = N->getOperand(2);
6368 
6369   SDLoc DL(N);
6370 
6371   EVT CmpVT = LHS.getValueType();
6372   if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6373     unsigned PromoteOp =
6374         ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6375     LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6376     RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6377   }
6378 
6379   ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6380 
6381   unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6382   EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
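       // AMDGPUISD::SETCC produces a lane mask with one bit per lane, so the
       // result type matches the wavefront size (i32 wave32, i64 wave64).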
6383 
6384   SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6385                               DAG.getCondCode(CCOpcode));
6386   if (VT.bitsEq(CCVT))
6387     return SetCC;
6388   return DAG.getZExtOrTrunc(SetCC, DL, VT);
6389 }
6390 
6391 static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6392                                   SelectionDAG &DAG) {
6393   EVT VT = N->getValueType(0);
6394 
6395   unsigned CondCode = N->getConstantOperandVal(3);
6396   if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6397     return DAG.getPOISON(VT);
6398 
6399   SDValue Src0 = N->getOperand(1);
6400   SDValue Src1 = N->getOperand(2);
6401   EVT CmpVT = Src0.getValueType();
6402   SDLoc SL(N);
6403 
6404   if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6405     Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6406     Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6407   }
6408 
6409   FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6410   ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6411   unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6412   EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6413   SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
6414                               DAG.getCondCode(CCOpcode));
6415   if (VT.bitsEq(CCVT))
6416     return SetCC;
6417   return DAG.getZExtOrTrunc(SetCC, SL, VT);
6418 }
6419 
6420 static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
6421                                     SelectionDAG &DAG) {
6422   EVT VT = N->getValueType(0);
6423   SDValue Src = N->getOperand(1);
6424   SDLoc SL(N);
6425 
6426   if (Src.getOpcode() == ISD::SETCC) {
6427     // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6428     return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6429                        Src.getOperand(1), Src.getOperand(2));
6430   }
6431   if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6432     // (ballot 0) -> 0
6433     if (Arg->isZero())
6434       return DAG.getConstant(0, SL, VT);
6435 
6436     // (ballot 1) -> EXEC/EXEC_LO
6437     if (Arg->isOne()) {
6438       Register Exec;
6439       if (VT.getScalarSizeInBits() == 32)
6440         Exec = AMDGPU::EXEC_LO;
6441       else if (VT.getScalarSizeInBits() == 64)
6442         Exec = AMDGPU::EXEC;
6443       else
6444         return SDValue();
6445 
6446       return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6447     }
6448   }
6449 
6450   // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6451   // ISD::SETNE)
6452   return DAG.getNode(
6453       AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6454       DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6455 }
6456 
6457 static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
6458                            SelectionDAG &DAG) {
6459   EVT VT = N->getValueType(0);
6460   unsigned ValSize = VT.getSizeInBits();
6461   unsigned IID = N->getConstantOperandVal(0);
6462   bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6463                       IID == Intrinsic::amdgcn_permlanex16;
6464   bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6465                        IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6466   SDLoc SL(N);
6467   MVT IntVT = MVT::getIntegerVT(ValSize);
6468   const GCNSubtarget *ST = TLI.getSubtarget();
6469   unsigned SplitSize = 32;
6470   if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6471       ST->hasDPALU_DPP() &&
6472       AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3)))
6473     SplitSize = 64;
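       // With a DPP-capable 64-bit ALU, update_dpp can be handled in 64-bit
       // pieces; otherwise the value is processed in 32-bit halves.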
6474 
6475   auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6476                                           SDValue Src2, MVT ValT) -> SDValue {
6477     SmallVector<SDValue, 8> Operands;
6478     switch (IID) {
6479     case Intrinsic::amdgcn_permlane16:
6480     case Intrinsic::amdgcn_permlanex16:
6481     case Intrinsic::amdgcn_update_dpp:
6482       Operands.push_back(N->getOperand(6));
6483       Operands.push_back(N->getOperand(5));
6484       Operands.push_back(N->getOperand(4));
6485       [[fallthrough]];
6486     case Intrinsic::amdgcn_writelane:
6487       Operands.push_back(Src2);
6488       [[fallthrough]];
6489     case Intrinsic::amdgcn_readlane:
6490     case Intrinsic::amdgcn_set_inactive:
6491     case Intrinsic::amdgcn_set_inactive_chain_arg:
6492     case Intrinsic::amdgcn_mov_dpp8:
6493       Operands.push_back(Src1);
6494       [[fallthrough]];
6495     case Intrinsic::amdgcn_readfirstlane:
6496     case Intrinsic::amdgcn_permlane64:
6497       Operands.push_back(Src0);
6498       break;
6499     default:
6500       llvm_unreachable("unhandled lane op");
6501     }
6502 
6503     Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6504     std::reverse(Operands.begin(), Operands.end());
6505 
6506     if (SDNode *GL = N->getGluedNode()) {
6507       assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6508       GL = GL->getOperand(0).getNode();
6509       Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6510                                      SDValue(GL, 0)));
6511     }
6512 
6513     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6514   };
6515 
6516   SDValue Src0 = N->getOperand(1);
6517   SDValue Src1, Src2;
6518   if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6519       IID == Intrinsic::amdgcn_mov_dpp8 ||
6520       IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6521     Src1 = N->getOperand(2);
6522     if (IID == Intrinsic::amdgcn_writelane ||
6523         IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6524       Src2 = N->getOperand(3);
6525   }
6526 
6527   if (ValSize == SplitSize) {
6528     // Already legal
6529     return SDValue();
6530   }
6531 
6532   if (ValSize < 32) {
6533     bool IsFloat = VT.isFloatingPoint();
6534     Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6535                                 SL, MVT::i32);
6536 
6537     if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6538       Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6539                                   SL, MVT::i32);
6540     }
6541 
6542     if (IID == Intrinsic::amdgcn_writelane) {
6543       Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6544                                   SL, MVT::i32);
6545     }
6546 
6547     SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6548     SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6549     return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6550   }
6551 
6552   if (ValSize % SplitSize != 0)
6553     return SDValue();
6554 
6555   auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6556     EVT VT = N->getValueType(0);
6557     unsigned NE = VT.getVectorNumElements();
6558     EVT EltVT = VT.getVectorElementType();
6559     SmallVector<SDValue, 8> Scalars;
6560     unsigned NumOperands = N->getNumOperands();
6561     SmallVector<SDValue, 4> Operands(NumOperands);
6562     SDNode *GL = N->getGluedNode();
6563 
6564     // only handle convergencectrl_glue
6565     assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6566 
6567     for (unsigned i = 0; i != NE; ++i) {
6568       for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6569            ++j) {
6570         SDValue Operand = N->getOperand(j);
6571         EVT OperandVT = Operand.getValueType();
6572         if (OperandVT.isVector()) {
6573           // A vector operand; extract a single element.
6574           EVT OperandEltVT = OperandVT.getVectorElementType();
6575           Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6576                                     Operand, DAG.getVectorIdxConstant(i, SL));
6577         } else {
6578           // A scalar operand; just use it as is.
6579           Operands[j] = Operand;
6580         }
6581       }
6582 
6583       if (GL)
6584         Operands[NumOperands - 1] =
6585             DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6586                         SDValue(GL->getOperand(0).getNode(), 0));
6587 
6588       Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6589     }
6590 
6591     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6592     return DAG.getBuildVector(VecVT, SL, Scalars);
6593   };
6594 
6595   if (VT.isVector()) {
6596     switch (MVT::SimpleValueType EltTy =
6597                 VT.getVectorElementType().getSimpleVT().SimpleTy) {
6598     case MVT::i32:
6599     case MVT::f32:
6600       if (SplitSize == 32) {
6601         SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6602         return unrollLaneOp(LaneOp.getNode());
6603       }
6604       [[fallthrough]];
6605     case MVT::i16:
6606     case MVT::f16:
6607     case MVT::bf16: {
6608       unsigned SubVecNumElt =
6609           SplitSize / VT.getVectorElementType().getSizeInBits();
6610       MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
6611       SmallVector<SDValue, 4> Pieces;
6612       SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6613       for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
6614         Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6615                                  DAG.getConstant(EltIdx, SL, MVT::i32));
6616 
6617         if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6618             IsPermLane16)
6619           Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6620                                    DAG.getConstant(EltIdx, SL, MVT::i32));
6621 
6622         if (IID == Intrinsic::amdgcn_writelane)
6623           Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6624                                    DAG.getConstant(EltIdx, SL, MVT::i32));
6625 
6626         Pieces.push_back(
6627             IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
6628                 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6629                 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6630         EltIdx += SubVecNumElt;
6631       }
6632       return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6633     }
6634     default:
6635       // Handle all other cases by bitcasting to i32 vectors
6636       break;
6637     }
6638   }
6639 
6640   MVT VecVT =
6641       MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
6642   Src0 = DAG.getBitcast(VecVT, Src0);
6643 
6644   if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6645     Src1 = DAG.getBitcast(VecVT, Src1);
6646 
6647   if (IID == Intrinsic::amdgcn_writelane)
6648     Src2 = DAG.getBitcast(VecVT, Src2);
6649 
6650   SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6651   SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6652   return DAG.getBitcast(VT, UnrolledLaneOp);
6653 }
6654 
6655 void SITargetLowering::ReplaceNodeResults(SDNode *N,
6656                                           SmallVectorImpl<SDValue> &Results,
6657                                           SelectionDAG &DAG) const {
6658   switch (N->getOpcode()) {
6659   case ISD::INSERT_VECTOR_ELT: {
6660     if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6661       Results.push_back(Res);
6662     return;
6663   }
6664   case ISD::EXTRACT_VECTOR_ELT: {
6665     if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6666       Results.push_back(Res);
6667     return;
6668   }
6669   case ISD::INTRINSIC_WO_CHAIN: {
6670     unsigned IID = N->getConstantOperandVal(0);
6671     switch (IID) {
6672     case Intrinsic::amdgcn_make_buffer_rsrc:
6673       Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6674       return;
6675     case Intrinsic::amdgcn_cvt_pkrtz: {
6676       SDValue Src0 = N->getOperand(1);
6677       SDValue Src1 = N->getOperand(2);
6678       SDLoc SL(N);
6679       SDValue Cvt =
6680           DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
6681       Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6682       return;
6683     }
6684     case Intrinsic::amdgcn_cvt_pknorm_i16:
6685     case Intrinsic::amdgcn_cvt_pknorm_u16:
6686     case Intrinsic::amdgcn_cvt_pk_i16:
6687     case Intrinsic::amdgcn_cvt_pk_u16: {
6688       SDValue Src0 = N->getOperand(1);
6689       SDValue Src1 = N->getOperand(2);
6690       SDLoc SL(N);
6691       unsigned Opcode;
6692 
6693       if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6694         Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
6695       else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6696         Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
6697       else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6698         Opcode = AMDGPUISD::CVT_PK_I16_I32;
6699       else
6700         Opcode = AMDGPUISD::CVT_PK_U16_U32;
6701 
6702       EVT VT = N->getValueType(0);
6703       if (isTypeLegal(VT))
6704         Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6705       else {
6706         SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6707         Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6708       }
6709       return;
6710     }
6711     case Intrinsic::amdgcn_s_buffer_load: {
6712       // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6713       // s_buffer_load_u8 for both signed and unsigned loads. Next, the DAG
6714       // combiner tries to merge the s_buffer_load_u8 with a sext instruction
6715       // (performSignExtendInRegCombine()) and replaces s_buffer_load_u8 with
6716       // s_buffer_load_i8.
6717       if (!Subtarget->hasScalarSubwordLoads())
6718         return;
6719       SDValue Op = SDValue(N, 0);
6720       SDValue Rsrc = Op.getOperand(1);
6721       SDValue Offset = Op.getOperand(2);
6722       SDValue CachePolicy = Op.getOperand(3);
6723       EVT VT = Op.getValueType();
6724       assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics");
6725       SDLoc DL(Op);
6726       MachineFunction &MF = DAG.getMachineFunction();
6727       const DataLayout &DataLayout = DAG.getDataLayout();
6728       Align Alignment =
6729           DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
6730       MachineMemOperand *MMO = MF.getMachineMemOperand(
6731           MachinePointerInfo(),
6732           MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6733               MachineMemOperand::MOInvariant,
6734           VT.getStoreSize(), Alignment);
6735       SDValue LoadVal;
6736       if (!Offset->isDivergent()) {
6737         SDValue Ops[] = {Rsrc, // source register
6738                          Offset, CachePolicy};
6739         SDValue BufferLoad =
6740             DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
6741                                     DAG.getVTList(MVT::i32), Ops, VT, MMO);
6742         LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6743       } else {
6744         SDValue Ops[] = {
6745             DAG.getEntryNode(),                    // Chain
6746             Rsrc,                                  // rsrc
6747             DAG.getConstant(0, DL, MVT::i32),      // vindex
6748             {},                                    // voffset
6749             {},                                    // soffset
6750             {},                                    // offset
6751             CachePolicy,                           // cachepolicy
6752             DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6753         };
6754         setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6755         LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6756       }
6757       Results.push_back(LoadVal);
6758       return;
6759     }
6760     case Intrinsic::amdgcn_dead: {
6761       for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
6762         Results.push_back(DAG.getPOISON(N->getValueType(I)));
6763       return;
6764     }
6765     }
6766     break;
6767   }
6768   case ISD::INTRINSIC_W_CHAIN: {
6769     if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6770       if (Res.getOpcode() == ISD::MERGE_VALUES) {
6771         // FIXME: Hacky
6772         for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6773           Results.push_back(Res.getOperand(I));
6774         }
6775       } else {
6776         Results.push_back(Res);
6777         Results.push_back(Res.getValue(1));
6778       }
6779       return;
6780     }
6781 
6782     break;
6783   }
6784   case ISD::SELECT: {
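    // Illustrative example: the select is performed on the equivalent integer
    // type, widened to i32 if needed, e.g.
    //   (select %c, f16:%a, f16:%b) -> (bitcast (trunc
    //       (select %c, (anyext (bitcast %a)), (anyext (bitcast %b)))))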
6785     SDLoc SL(N);
6786     EVT VT = N->getValueType(0);
6787     EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6788     SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6789     SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6790 
6791     EVT SelectVT = NewVT;
6792     if (NewVT.bitsLT(MVT::i32)) {
6793       LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6794       RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6795       SelectVT = MVT::i32;
6796     }
6797 
6798     SDValue NewSelect =
6799         DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
6800 
6801     if (NewVT != SelectVT)
6802       NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6803     Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6804     return;
6805   }
6806   case ISD::FNEG: {
6807     if (N->getValueType(0) != MVT::v2f16)
6808       break;
6809 
6810     SDLoc SL(N);
6811     SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6812 
6813     SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
6814                              DAG.getConstant(0x80008000, SL, MVT::i32));
6815     Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6816     return;
6817   }
6818   case ISD::FABS: {
6819     if (N->getValueType(0) != MVT::v2f16)
6820       break;
6821 
6822     SDLoc SL(N);
6823     SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6824 
6825     SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
6826                              DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6827     Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6828     return;
6829   }
6830   case ISD::FSQRT: {
6831     if (N->getValueType(0) != MVT::f16)
6832       break;
6833     Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6834     break;
6835   }
6836   default:
6837     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
6838     break;
6839   }
6840 }
6841 
6842 /// Helper function for LowerBRCOND
6843 static SDNode *findUser(SDValue Value, unsigned Opcode) {
6845   for (SDUse &U : Value->uses()) {
6846     if (U.get() != Value)
6847       continue;
6848 
6849     if (U.getUser()->getOpcode() == Opcode)
6850       return U.getUser();
6851   }
6852   return nullptr;
6853 }
6854 
6855 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6856   if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6857     switch (Intr->getConstantOperandVal(1)) {
6858     case Intrinsic::amdgcn_if:
6859       return AMDGPUISD::IF;
6860     case Intrinsic::amdgcn_else:
6861       return AMDGPUISD::ELSE;
6862     case Intrinsic::amdgcn_loop:
6863       return AMDGPUISD::LOOP;
6864     case Intrinsic::amdgcn_end_cf:
6865       llvm_unreachable("should not occur");
6866     default:
6867       return 0;
6868     }
6869   }
6870 
6871   // break, if_break, else_break are all only used as inputs to loop, not
6872   // directly as branch conditions.
6873   return 0;
6874 }
6875 
6876 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
6877   const Triple &TT = getTargetMachine().getTargetTriple();
6878   return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
6879           GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
6880          AMDGPU::shouldEmitConstantsToTextSection(TT);
6881 }
6882 
6883 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
6884   if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6885     return false;
6886 
6887   // FIXME: Either avoid relying on address space here or change the default
6888   // address space for functions to avoid the explicit check.
6889   return (GV->getValueType()->isFunctionTy() ||
6890           !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
6891          !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
6892 }
6893 
6894 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
6895   return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6896 }
6897 
6898 bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
6899   if (!GV->hasExternalLinkage())
6900     return true;
6901 
6902   const auto OS = getTargetMachine().getTargetTriple().getOS();
6903   return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6904 }
6905 
6906 /// This transforms the control flow intrinsics to get the branch destination as
6907 /// the last parameter, and switches the branch target with BR if the need arises.
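/// Illustrative example (not from the original comments): a divergent
///   brcond (setcc (amdgcn.if %c), 1, setne), %bb
/// becomes an AMDGPUISD::IF node whose last operand is the branch destination,
/// so the control-flow pseudo itself carries the target block.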
6908 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
6909   SDLoc DL(BRCOND);
6910 
6911   SDNode *Intr = BRCOND.getOperand(1).getNode();
6912   SDValue Target = BRCOND.getOperand(2);
6913   SDNode *BR = nullptr;
6914   SDNode *SetCC = nullptr;
6915 
6916   if (Intr->getOpcode() == ISD::SETCC) {
6917     // As long as we negate the condition, everything is fine
6918     SetCC = Intr;
6919     Intr = SetCC->getOperand(0).getNode();
6920 
6921   } else {
6922     // Get the target from BR if we don't negate the condition
6923     BR = findUser(BRCOND, ISD::BR);
6924     assert(BR && "brcond missing unconditional branch user");
6925     Target = BR->getOperand(1);
6926   }
6927 
6928   unsigned CFNode = isCFIntrinsic(Intr);
6929   if (CFNode == 0) {
6930     // This is a uniform branch so we don't need to legalize.
6931     return BRCOND;
6932   }
6933 
6934   bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6935                    Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6936 
6937   assert(!SetCC ||
6938          (SetCC->getConstantOperandVal(1) == 1 &&
6939           cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6940               ISD::SETNE));
6941 
6942   // operands of the new intrinsic call
6943   SmallVector<SDValue, 4> Ops;
6944   if (HaveChain)
6945     Ops.push_back(BRCOND.getOperand(0));
6946 
6947   Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6948   Ops.push_back(Target);
6949 
6950   ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6951 
6952   // Build the new intrinsic call.
6953   SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6954 
6955   if (!HaveChain) {
6956     SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
6957 
6958     Result = DAG.getMergeValues(Ops, DL).getNode();
6959   }
6960 
6961   if (BR) {
6962     // Give the branch instruction our target
6963     SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
6964     SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6965     DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6966   }
6967 
6968   SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6969 
6970   // Copy the intrinsic results to registers
6971   for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6972     SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
6973     if (!CopyToReg)
6974       continue;
6975 
6976     Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
6977                              SDValue(Result, i - 1), SDValue());
6978 
6979     DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6980   }
6981 
6982   // Remove the old intrinsic from the chain
6983   DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
6984                                 Intr->getOperand(0));
6985 
6986   return Chain;
6987 }
6988 
6989 SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
6990   MVT VT = Op.getSimpleValueType();
6991   SDLoc DL(Op);
6992   // Checking the depth
6993   if (Op.getConstantOperandVal(0) != 0)
6994     return DAG.getConstant(0, DL, VT);
6995 
6996   MachineFunction &MF = DAG.getMachineFunction();
6997   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6998   // Check for kernel and shader functions
6999   if (Info->isEntryFunction())
7000     return DAG.getConstant(0, DL, VT);
7001 
7002   MachineFrameInfo &MFI = MF.getFrameInfo();
7003   // There is a call to @llvm.returnaddress in this function
7004   MFI.setReturnAddressIsTaken(true);
7005 
7006   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7007   // Get the return address reg and mark it as an implicit live-in
7008   Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7009                               getRegClassFor(VT, Op.getNode()->isDivergent()));
7010 
7011   return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7012 }
7013 
7014 SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7015                                             const SDLoc &DL, EVT VT) const {
7016   return Op.getValueType().bitsLE(VT)
7017              ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7018              : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7019                            DAG.getTargetConstant(0, DL, MVT::i32));
7020 }
7021 
7022 SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7023                                                 SelectionDAG &DAG) const {
7024   EVT DstVT = Op.getValueType();
7025   unsigned NumElts = DstVT.getVectorNumElements();
7026   assert(NumElts > 2 && isPowerOf2_32(NumElts));
7027 
7028   auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7029 
7030   SDLoc DL(Op);
7031   unsigned Opc = Op.getOpcode();
7032   SDValue Flags = Op.getOperand(1);
7033   EVT HalfDstVT =
7034       EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7035   SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7036   SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7037 
7038   return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7039 }
7040 
7041 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7042   SDValue Src = Op.getOperand(0);
7043   EVT SrcVT = Src.getValueType();
7044   EVT DstVT = Op.getValueType();
7045 
7046   if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7047     assert(Subtarget->hasCvtPkF16F32Inst() && "requires v_cvt_pk_f16_f32");
7048     if (SrcVT.getScalarType() != MVT::f32)
7049       return SDValue();
7050     return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7051   }
7052 
7053   if (SrcVT.getScalarType() != MVT::f64)
7054     return Op;
7055 
7056   SDLoc DL(Op);
7057   if (DstVT == MVT::f16) {
7058     // TODO: Handle strictfp
7059     if (Op.getOpcode() != ISD::FP_ROUND)
7060       return Op;
7061 
7062     if (!Subtarget->has16BitInsts()) {
7063       SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7064       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7065       return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7066     }
7067     if (getTargetMachine().Options.UnsafeFPMath) {
7068       SDValue Flags = Op.getOperand(1);
7069       SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7070       return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7071     }
7072     SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7073     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7074     return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7075   }
7076 
7077   assert(DstVT.getScalarType() == MVT::bf16 &&
7078          "custom lower FP_ROUND for f16 or bf16");
7079   assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7080 
7081   // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7082   // hardware f32 -> bf16 instruction.
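  // Rounding to odd on the first step avoids double rounding: the low bit of
  // the intermediate f32 acts as a sticky bit, so the final f32 -> bf16
  // rounding gives the same result as a single direct f64 -> bf16 rounding.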
7083   EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32)
7084                                : MVT::f32;
7085   SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7086   return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7087                      DAG.getTargetConstant(0, DL, MVT::i32));
7088 }
7089 
7090 SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7091                                                SelectionDAG &DAG) const {
7092   EVT VT = Op.getValueType();
7093   const MachineFunction &MF = DAG.getMachineFunction();
7094   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7095   bool IsIEEEMode = Info->getMode().IEEE;
7096 
7097   // FIXME: Assert during selection that this is only selected for
7098   // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7099   // mode functions, but this happens to be OK since it's only done in cases
7100   // where there is known no sNaN.
7101   if (IsIEEEMode)
7102     return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7103 
7104   if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7105       VT == MVT::v16bf16)
7106     return splitBinaryVectorOp(Op, DAG);
7107   return Op;
7108 }
7109 
7110 SDValue
7111 SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7112                                                SelectionDAG &DAG) const {
7113   EVT VT = Op.getValueType();
7114   const MachineFunction &MF = DAG.getMachineFunction();
7115   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7116   bool IsIEEEMode = Info->getMode().IEEE;
7117 
7118   if (IsIEEEMode)
7119     return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7120 
7121   if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7122       VT == MVT::v16bf16)
7123     return splitBinaryVectorOp(Op, DAG);
7124   return Op;
7125 }
7126 
7127 SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7128                                                  SelectionDAG &DAG) const {
7129   EVT VT = Op.getValueType();
7130   if (VT.isVector())
7131     return splitBinaryVectorOp(Op, DAG);
7132 
7133   assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7134          !Subtarget->hasMinimum3Maximum3F16() &&
7135          Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7136          "should not need to widen f16 minimum/maximum to v2f16");
7137 
7138   // Widen f16 operation to v2f16
7139 
7140   // fminimum f16:x, f16:y ->
7141   //   extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7142   //                                (v2f16 (scalar_to_vector y))), 0
7143   SDLoc SL(Op);
7144   SDValue WideSrc0 =
7145       DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7146   SDValue WideSrc1 =
7147       DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7148 
7149   SDValue Widened =
7150       DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7151 
7152   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7153                      DAG.getConstant(0, SL, MVT::i32));
7154 }
7155 
7156 SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7157   bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7158   EVT VT = Op.getValueType();
7159   assert(VT == MVT::f16);
7160 
7161   SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7162   EVT ExpVT = Exp.getValueType();
7163   if (ExpVT == MVT::i16)
7164     return Op;
7165 
7166   SDLoc DL(Op);
7167 
7168   // Correct the exponent type for f16 to i16.
7169   // Clamp the range of the exponent to the instruction's range.
7170 
7171   // TODO: This should be a generic narrowing legalization, and can easily be
7172   // done for GlobalISel.
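  // Illustrative example of the clamp + truncate below:
  //   ldexp(f16 %x, i32 %e)
  //     -> ldexp(%x, (trunc (smin (smax %e, -32768), 32767)))
  // Any exponent outside [-32768, 32767] already over/underflows f16, so
  // clamping does not change the result.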
7173 
7174   SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7175   SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7176 
7177   SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7178   SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7179 
7180   SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7181 
7182   if (IsStrict) {
7183     return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7184                        {Op.getOperand(0), Op.getOperand(1), TruncExp});
7185   }
7186 
7187   return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7188 }
7189 
7190 static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
7191   switch (Op->getOpcode()) {
7192   case ISD::SRA:
7193   case ISD::SMIN:
7194   case ISD::SMAX:
7195     return ISD::SIGN_EXTEND;
7196   case ISD::SRL:
7197   case ISD::UMIN:
7198   case ISD::UMAX:
7199     return ISD::ZERO_EXTEND;
7200   case ISD::ADD:
7201   case ISD::SUB:
7202   case ISD::AND:
7203   case ISD::OR:
7204   case ISD::XOR:
7205   case ISD::SHL:
7206   case ISD::SELECT:
7207   case ISD::MUL:
7208     // The operation result won't be influenced by garbage high bits.
7209     // TODO: Are all of these cases correct, and are there more?
7210     return ISD::ANY_EXTEND;
7211   case ISD::SETCC: {
7212     ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7213     return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7214   }
7215   default:
7216     llvm_unreachable("unexpected opcode!");
7217   }
7218 }
7219 
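// Illustrative example of the promotion performed below: a uniform
//   (add i16:%a, i16:%b)
// becomes
//   (trunc (add i32:(anyext %a), i32:(anyext %b)))
// so it can be selected to 32-bit SALU instructions; setcc instead
// sign/zero-extends its operands and keeps the i1 result.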
7220 SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7221                                                 DAGCombinerInfo &DCI) const {
7222   const unsigned Opc = Op.getOpcode();
7223   assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7224          Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7225          Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7226          Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7227          Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7228 
7229   EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7230                                  : Op->getOperand(0).getValueType();
7231   auto ExtTy = OpTy.changeElementType(MVT::i32);
7232 
7233   if (DCI.isBeforeLegalizeOps() ||
7234       isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7235     return SDValue();
7236 
7237   auto &DAG = DCI.DAG;
7238 
7239   SDLoc DL(Op);
7240   SDValue LHS;
7241   SDValue RHS;
7242   if (Opc == ISD::SELECT) {
7243     LHS = Op->getOperand(1);
7244     RHS = Op->getOperand(2);
7245   } else {
7246     LHS = Op->getOperand(0);
7247     RHS = Op->getOperand(1);
7248   }
7249 
7250   const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7251   LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7252 
7253   // Special case: for shifts, the RHS always needs a zext.
7254   if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7255     RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7256   else
7257     RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7258 
7259   // setcc always return i1/i1 vec so no need to truncate after.
7260   if (Opc == ISD::SETCC) {
7261     ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7262     return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7263   }
7264 
7265   // For other ops, we extend the operation's return type as well so we need to
7266   // truncate back to the original type.
7267   SDValue NewVal;
7268   if (Opc == ISD::SELECT)
7269     NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7270   else
7271     NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7272 
7273   return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
7274 }
7275 
7276 SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7277   SDValue Mag = Op.getOperand(0);
7278   EVT MagVT = Mag.getValueType();
7279 
7280   if (MagVT.getVectorNumElements() > 2)
7281     return splitBinaryVectorOp(Op, DAG);
7282 
7283   SDValue Sign = Op.getOperand(1);
7284   EVT SignVT = Sign.getValueType();
7285 
7286   if (MagVT == SignVT)
7287     return Op;
7288 
7289   // fcopysign v2f16:mag, v2f32:sign ->
7290   //   fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7291 
7292   SDLoc SL(Op);
7293   SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7294   SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
7295 
7296   SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7297 
7298   return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
7299 }
7300 
7301 // Custom lowering for vector multiplications and s_mul_u64.
7302 SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7303   EVT VT = Op.getValueType();
7304 
7305   // Split vector operands.
7306   if (VT.isVector())
7307     return splitBinaryVectorOp(Op, DAG);
7308 
7309   assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
7310 
7311   // There are four ways to lower s_mul_u64:
7312   //
7313   // 1. If all the operands are uniform, then we lower it as it is.
7314   //
7315   // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
7316   //    multiplications because there is not a vector equivalent of s_mul_u64.
7317   //
7318   // 3. If the cost model decides that it is more efficient to use vector
7319   //    registers, then we have to split s_mul_u64 in 32-bit multiplications.
7320   //    This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
7321   //
7322   // 4. If the cost model decides to use vector registers and both of the
7323   //    operands are zero-extended/sign-extended from 32-bits, then we split the
7324   //    s_mul_u64 in two 32-bit multiplications. The problem is that it is not
7325   //    possible to check if the operands are zero-extended or sign-extended in
7326   //    SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
7327   //    s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
7328   //    s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7329   //    If the cost model decides that we have to use vector registers, then
7330   //    splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
7331   //    s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
7332   //    decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7333   //    s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
7334   //    SIInstrInfo.cpp .
7335 
7336   if (Op->isDivergent())
7337     return SDValue();
7338 
7339   SDValue Op0 = Op.getOperand(0);
7340   SDValue Op1 = Op.getOperand(1);
7341   // If both operands are zero-extended to 32 bits, then we replace s_mul_u64
7342   // with s_mul_u64_u32_pseudo. If both operands are sign-extended to
7343   // 32 bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
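  // For example (illustrative):
  //   (mul (zext i32:%a), (zext i32:%b))
  // has at least 32 known leading zeros in each operand and becomes
  // S_MUL_U64_U32_PSEUDO; the sign-extended analogue has at least 33 sign
  // bits and becomes S_MUL_I64_I32_PSEUDO.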
7344   KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7345   unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7346   KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7347   unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7348   SDLoc SL(Op);
7349   if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7350     return SDValue(
7351         DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7352   unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7353   unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7354   if (Op0SignBits >= 33 && Op1SignBits >= 33)
7355     return SDValue(
7356         DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7357   // If all the operands are uniform, then we lower s_mul_u64 as it is.
7358   return Op;
7359 }
7360 
7361 SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7362   EVT VT = Op.getValueType();
7363   SDLoc SL(Op);
7364   SDValue LHS = Op.getOperand(0);
7365   SDValue RHS = Op.getOperand(1);
7366   bool isSigned = Op.getOpcode() == ISD::SMULO;
7367 
7368   if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
7369     const APInt &C = RHSC->getAPIntValue();
7370     // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
7371     if (C.isPowerOf2()) {
7372       // smulo(x, signed_min) is same as umulo(x, signed_min).
7373       bool UseArithShift = isSigned && !C.isMinSignedValue();
7374       SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
7375       SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
7376       SDValue Overflow =
7377           DAG.getSetCC(SL, MVT::i1,
7378                        DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
7379                                    Result, ShiftAmt),
7380                        LHS, ISD::SETNE);
7381       return DAG.getMergeValues({Result, Overflow}, SL);
7382     }
7383   }
7384 
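  // General case: the multiply overflows iff the high half of the full
  // product differs from the sign bits (signed) or from zero (unsigned) of
  // the low half, e.g. for i32:
  //   overflow = (mulhs x, y) != (sra (mul x, y), 31)   // smulo
  //   overflow = (mulhu x, y) != 0                      // umulo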
7385   SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
7386   SDValue Top =
7387       DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
7388 
7389   SDValue Sign = isSigned
7390                      ? DAG.getNode(ISD::SRA, SL, VT, Result,
7391                                    DAG.getConstant(VT.getScalarSizeInBits() - 1,
7392                                                    SL, MVT::i32))
7393                      : DAG.getConstant(0, SL, VT);
7394   SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
7395 
7396   return DAG.getMergeValues({Result, Overflow}, SL);
7397 }
7398 
7399 SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
7400   if (Op->isDivergent()) {
7401     // Select to V_MAD_[IU]64_[IU]32.
7402     return Op;
7403   }
7404   if (Subtarget->hasSMulHi()) {
7405     // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
7406     return SDValue();
7407   }
7408   // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
7409   // calculate the high part, so we might as well do the whole thing with
7410   // V_MAD_[IU]64_[IU]32.
7411   return Op;
7412 }
7413 
7414 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
7415   if (!Subtarget->isTrapHandlerEnabled() ||
7416       Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7417     return lowerTrapEndpgm(Op, DAG);
7418 
7419   return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
7420                                             : lowerTrapHsaQueuePtr(Op, DAG);
7421 }
7422 
7423 SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
7424   SDLoc SL(Op);
7425   SDValue Chain = Op.getOperand(0);
7426   return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
7427 }
7428 
7429 SDValue
7430 SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
7431                                              const SDLoc &DL, Align Alignment,
7432                                              ImplicitParameter Param) const {
7433   MachineFunction &MF = DAG.getMachineFunction();
7434   uint64_t Offset = getImplicitParameterOffset(MF, Param);
7435   SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
7436   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7437   return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
7438                      MachineMemOperand::MODereferenceable |
7439                          MachineMemOperand::MOInvariant);
7440 }
7441 
7442 SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
7443                                                SelectionDAG &DAG) const {
7444   SDLoc SL(Op);
7445   SDValue Chain = Op.getOperand(0);
7446 
7447   SDValue QueuePtr;
7448   // For code object version 5, QueuePtr is passed through implicit kernarg.
7449   const Module *M = DAG.getMachineFunction().getFunction().getParent();
7450   if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7451     QueuePtr =
7452         loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
7453   } else {
7454     MachineFunction &MF = DAG.getMachineFunction();
7455     SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7456     Register UserSGPR = Info->getQueuePtrUserSGPR();
7457 
7458     if (UserSGPR == AMDGPU::NoRegister) {
7459       // We probably are in a function incorrectly marked with
7460       // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
7461       // trap, so just use a null pointer.
7462       QueuePtr = DAG.getConstant(0, SL, MVT::i64);
7463     } else {
7464       QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
7465                                       MVT::i64);
7466     }
7467   }
7468 
7469   SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
7470   SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
7471 
7472   uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7473   SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
7474                    ToReg.getValue(1)};
7475   return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7476 }
7477 
7478 SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
7479   SDLoc SL(Op);
7480   SDValue Chain = Op.getOperand(0);
7481 
7482   // We need to simulate the 's_trap 2' instruction on targets that run in
7483   // PRIV=1 (where it is treated as a nop).
7484   if (Subtarget->hasPrivEnabledTrap2NopBug())
7485     return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
7486 
7487   uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7488   SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7489   return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7490 }
7491 
7492 SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
7493   SDLoc SL(Op);
7494   SDValue Chain = Op.getOperand(0);
7495   MachineFunction &MF = DAG.getMachineFunction();
7496 
7497   if (!Subtarget->isTrapHandlerEnabled() ||
7498       Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7499     LLVMContext &Ctx = MF.getFunction().getContext();
7500     Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
7501                                            "debugtrap handler not supported",
7502                                            Op.getDebugLoc(), DS_Warning));
7503     return Chain;
7504   }
7505 
7506   uint64_t TrapID =
7507       static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
7508   SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7509   return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7510 }
7511 
7512 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7513                                              SelectionDAG &DAG) const {
7514   if (Subtarget->hasApertureRegs()) {
7515     const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
7516                                        ? AMDGPU::SRC_SHARED_BASE
7517                                        : AMDGPU::SRC_PRIVATE_BASE;
7518     // Note: this feature (register) is broken. When used as a 32-bit operand,
7519     // it returns a wrong value (all zeroes?). The real value is in the upper 32
7520     // bits.
7521     //
7522     // To work around the issue, directly emit a 64 bit mov from this register
7523     // then extract the high bits. Note that this shouldn't even result in a
7524     // shift being emitted and simply become a pair of registers (e.g.):
7525     //    s_mov_b64 s[6:7], src_shared_base
7526     //    v_mov_b32_e32 v1, s7
7527     //
7528     // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
7529     // coalescing would kick in and it would think it's okay to use the "HI"
7530     // subregister directly (instead of extracting the HI 32 bits) which is an
7531     // artificial (unusable) register.
7532     //  Register TableGen definitions would need an overhaul to get rid of the
7533     //  artificial "HI" aperture registers and prevent this kind of issue from
7534     //  happening.
7535     SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
7536                                      DAG.getRegister(ApertureRegNo, MVT::i64));
7537     return DAG.getNode(
7538         ISD::TRUNCATE, DL, MVT::i32,
7539         DAG.getNode(ISD::SRL, DL, MVT::i64,
7540                     {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7541   }
7542 
7543   // For code object version 5, private_base and shared_base are passed through
7544   // implicit kernargs.
7545   const Module *M = DAG.getMachineFunction().getFunction().getParent();
7546   if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7547     ImplicitParameter Param =
7548         (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
7549     return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7550   }
7551 
7552   MachineFunction &MF = DAG.getMachineFunction();
7553   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7554   Register UserSGPR = Info->getQueuePtrUserSGPR();
7555   if (UserSGPR == AMDGPU::NoRegister) {
7556     // We probably are in a function incorrectly marked with
7557     // amdgpu-no-queue-ptr. This is undefined.
7558     return DAG.getPOISON(MVT::i32);
7559   }
7560 
7561   SDValue QueuePtr =
7562       CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7563 
7564   // Offset into amd_queue_t for group_segment_aperture_base_hi /
7565   // private_segment_aperture_base_hi.
7566   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7567 
7568   SDValue Ptr =
7569       DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7570 
7571   // TODO: Use custom target PseudoSourceValue.
7572   // TODO: We should use the value from the IR intrinsic call, but it might not
7573   // be available, and it is unclear how to get it here.
7574   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7575   return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7576                      commonAlignment(Align(64), StructOffset),
7577                      MachineMemOperand::MODereferenceable |
7578                          MachineMemOperand::MOInvariant);
7579 }
7580 
7581 /// Return true if the value is a known valid address, such that a null check is
7582 /// not necessary.
7583 static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
7584                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7585   if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
7586     return true;
7587 
7588   if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7589     return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7590 
7591   // TODO: Search through arithmetic, handle arguments and loads
7592   // marked nonnull.
7593   return false;
7594 }
7595 
7596 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7597                                              SelectionDAG &DAG) const {
7598   SDLoc SL(Op);
7599 
7600   const AMDGPUTargetMachine &TM =
7601       static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7602 
7603   unsigned DestAS, SrcAS;
7604   SDValue Src;
7605   bool IsNonNull = false;
7606   if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7607     SrcAS = ASC->getSrcAddressSpace();
7608     Src = ASC->getOperand(0);
7609     DestAS = ASC->getDestAddressSpace();
7610   } else {
7611     assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7612            Op.getConstantOperandVal(0) ==
7613                Intrinsic::amdgcn_addrspacecast_nonnull);
7614     Src = Op->getOperand(1);
7615     SrcAS = Op->getConstantOperandVal(2);
7616     DestAS = Op->getConstantOperandVal(3);
7617     IsNonNull = true;
7618   }
7619 
7620   SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7621 
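  // Illustrative sketch of the lowerings below:
  //   flat -> local/private:
  //     cast(p) = (p != 0) ? trunc(p to i32) : segment_null
  //   local/private -> flat:
  //     cast(p) = (p != segment_null) ? (aperture_hi << 32) | zext(p) : null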
7622   // flat -> local/private
7623   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7624     if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7625         DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7626       SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7627 
7628       if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7629         return Ptr;
7630 
7631       unsigned NullVal = TM.getNullPointerValue(DestAS);
7632       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7633       SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7634 
7635       return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7636                          SegmentNullPtr);
7637     }
7638   }
7639 
7640   // local/private -> flat
7641   if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7642     if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7643         SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7644 
7645       SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7646       SDValue CvtPtr =
7647           DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7648       CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7649 
7650       if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7651         return CvtPtr;
7652 
7653       unsigned NullVal = TM.getNullPointerValue(SrcAS);
7654       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7655 
7656       SDValue NonNull =
7657           DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7658 
7659       return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7660                          FlatNullPtr);
7661     }
7662   }
7663 
7664   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7665       Op.getValueType() == MVT::i64) {
7666     const SIMachineFunctionInfo *Info =
7667         DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
7668     SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7669     SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7670     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7671   }
7672 
7673   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7674       Src.getValueType() == MVT::i64)
7675     return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7676 
7677   // global <-> flat are no-ops and never emitted.
7678 
7679   // Invalid casts are poison.
7680   return DAG.getPOISON(Op->getValueType(0));
7681 }
7682 
7683 // This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7684 // the small vector and inserting them into the big vector. That is better than
7685 // the default expansion of doing it via a stack slot. Even though the use of
7686 // the stack slot would be optimized away afterwards, the stack slot itself
7687 // remains.
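// For example (illustrative), inserting v2i16:%ins into v4i16:%vec at index 2
// becomes a single 32-bit element insert:
//   (bitcast (insert_vector_elt (bitcast %vec to v2i32),
//                               (bitcast %ins to i32), 1))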
7688 SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7689                                                 SelectionDAG &DAG) const {
7690   SDValue Vec = Op.getOperand(0);
7691   SDValue Ins = Op.getOperand(1);
7692   SDValue Idx = Op.getOperand(2);
7693   EVT VecVT = Vec.getValueType();
7694   EVT InsVT = Ins.getValueType();
7695   EVT EltVT = VecVT.getVectorElementType();
7696   unsigned InsNumElts = InsVT.getVectorNumElements();
7697   unsigned IdxVal = Idx->getAsZExtVal();
7698   SDLoc SL(Op);
7699 
7700   if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7701     // Insert 32-bit registers at a time.
7702     assert(InsNumElts % 2 == 0 && "expect legal vector types");
7703 
7704     unsigned VecNumElts = VecVT.getVectorNumElements();
7705     EVT NewVecVT =
7706         EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7707     EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7708                                    : EVT::getVectorVT(*DAG.getContext(),
7709                                                       MVT::i32, InsNumElts / 2);
7710 
7711     Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7712     Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7713 
7714     for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7715       SDValue Elt;
7716       if (InsNumElts == 2) {
7717         Elt = Ins;
7718       } else {
7719         Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7720                           DAG.getConstant(I, SL, MVT::i32));
7721       }
7722       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7723                         DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7724     }
7725 
7726     return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7727   }
7728 
7729   for (unsigned I = 0; I != InsNumElts; ++I) {
7730     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7731                               DAG.getConstant(I, SL, MVT::i32));
7732     Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7733                       DAG.getConstant(IdxVal + I, SL, MVT::i32));
7734   }
7735   return Vec;
7736 }
7737 
7738 SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7739                                                  SelectionDAG &DAG) const {
7740   SDValue Vec = Op.getOperand(0);
7741   SDValue InsVal = Op.getOperand(1);
7742   SDValue Idx = Op.getOperand(2);
7743   EVT VecVT = Vec.getValueType();
7744   EVT EltVT = VecVT.getVectorElementType();
7745   unsigned VecSize = VecVT.getSizeInBits();
7746   unsigned EltSize = EltVT.getSizeInBits();
7747   SDLoc SL(Op);
7748 
7749   // Specially handle the case of v4i16 with static indexing.
7750   unsigned NumElts = VecVT.getVectorNumElements();
7751   auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
7752   if (NumElts == 4 && EltSize == 16 && KIdx) {
7753     SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7754 
7755     SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7756                                  DAG.getConstant(0, SL, MVT::i32));
7757     SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7758                                  DAG.getConstant(1, SL, MVT::i32));
7759 
7760     SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7761     SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7762 
7763     unsigned Idx = KIdx->getZExtValue();
7764     bool InsertLo = Idx < 2;
7765     SDValue InsHalf = DAG.getNode(
7766         ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
7767         DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7768         DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7769 
7770     InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7771 
7772     SDValue Concat =
7773         InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
7774                  : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
7775 
7776     return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7777   }
7778 
7779   // Static indexing does not lower to stack access, and hence there is no need
7780   // for special custom lowering to avoid stack access.
7781   if (isa<ConstantSDNode>(Idx))
7782     return SDValue();
7783 
7784   // Avoid stack access for dynamic indexing by custom lowering to
7785   // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
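  //
  // For example (illustrative), inserting i16:%val into v4i16:%vec at dynamic
  // index %idx:
  //   bitidx = %idx << 4
  //   mask   = 0xffff << bitidx
  //   result = ((splat %val) & mask) | (%vec & ~mask)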
7786 
7787   assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7788 
7789   MVT IntVT = MVT::getIntegerVT(VecSize);
7790 
7791   // Convert vector index to bit-index and get the required bit mask.
7792   assert(isPowerOf2_32(EltSize));
7793   const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7794   SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7795   SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7796   SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7797                             DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7798 
7799   // 1. Create a congruent vector with the target value in each element.
7800   SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7801                                DAG.getSplatBuildVector(VecVT, SL, InsVal));
7802 
7803   // 2. Mask off all other indices except the required index within (1).
7804   SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7805 
7806   // 3. Mask off the required index within the target vector.
7807   SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7808   SDValue RHS =
7809       DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
7810 
7811   // 4. Get (2) and (3) ORed into the target vector.
7812   SDValue BFI =
7813       DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
7814 
7815   return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7816 }
7817 
7818 SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7819                                                   SelectionDAG &DAG) const {
7820   SDLoc SL(Op);
7821 
7822   EVT ResultVT = Op.getValueType();
7823   SDValue Vec = Op.getOperand(0);
7824   SDValue Idx = Op.getOperand(1);
7825   EVT VecVT = Vec.getValueType();
7826   unsigned VecSize = VecVT.getSizeInBits();
7827   EVT EltVT = VecVT.getVectorElementType();
7828 
7829   DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7830 
7831   // Make sure we do any optimizations that will make it easier to fold
7832   // source modifiers before obscuring it with bit operations.
7833 
7834   // XXX - Why doesn't this get called when vector_shuffle is expanded?
7835   if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
7836     return Combined;
7837 
7838   if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7839     SDValue Lo, Hi;
7840     auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
7841 
7842     if (VecSize == 128) {
7843       SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7844       Lo = DAG.getBitcast(LoVT,
7845                           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7846                                       DAG.getConstant(0, SL, MVT::i32)));
7847       Hi = DAG.getBitcast(HiVT,
7848                           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7849                                       DAG.getConstant(1, SL, MVT::i32)));
7850     } else if (VecSize == 256) {
7851       SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7852       SDValue Parts[4];
7853       for (unsigned P = 0; P < 4; ++P) {
7854         Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7855                                DAG.getConstant(P, SL, MVT::i32));
7856       }
7857 
7858       Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7859                                             Parts[0], Parts[1]));
7860       Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7861                                             Parts[2], Parts[3]));
7862     } else {
7863       assert(VecSize == 512);
7864 
7865       SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7866       SDValue Parts[8];
7867       for (unsigned P = 0; P < 8; ++P) {
7868         Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7869                                DAG.getConstant(P, SL, MVT::i32));
7870       }
7871 
7872       Lo = DAG.getBitcast(LoVT,
7873                           DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7874                                       Parts[0], Parts[1], Parts[2], Parts[3]));
7875       Hi = DAG.getBitcast(HiVT,
7876                           DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7877                                       Parts[4], Parts[5], Parts[6], Parts[7]));
7878     }
7879 
7880     EVT IdxVT = Idx.getValueType();
7881     unsigned NElem = VecVT.getVectorNumElements();
7882     assert(isPowerOf2_32(NElem));
7883     SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7884     SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7885     SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7886     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7887   }
7888 
7889   assert(VecSize <= 64);
7890 
7891   MVT IntVT = MVT::getIntegerVT(VecSize);
7892 
7893   // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7894   SDValue VecBC = peekThroughBitcasts(Vec);
7895   if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7896     SDValue Src = VecBC.getOperand(0);
7897     Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7898     Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7899   }
7900 
7901   unsigned EltSize = EltVT.getSizeInBits();
7902   assert(isPowerOf2_32(EltSize));
7903 
7904   SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7905 
7906   // Convert vector index to bit-index (* EltSize)
7907   SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7908 
7909   SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7910   SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7911 
7912   if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7913     SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7914     return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7915   }
7916 
7917   return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7918 }
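
// A scalar model of the shift-based dynamic extract above (an illustrative
// sketch with a hypothetical name, not part of the lowering), for a 64-bit
// vector of 16-bit elements:
//
//   static uint16_t dynExtract16(uint64_t Vec, unsigned Idx) {
//     return static_cast<uint16_t>(Vec >> (Idx * 16)); // SRL + TRUNCATE
//   }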
7919 
7920 static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7921   assert(Elt % 2 == 0);
7922   return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7923 }
7924 
7925 static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
7926   assert(Elt % 2 == 0);
7927   return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
7928          !(Mask[Elt + 1] & 1);
7929 }
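
// Worked example for the two predicates above (illustrative): with
// Mask = <2, 3, 5, 4>, the pair at Elt = 0 is contiguous (2 then 3, and 2 is
// even), so it can become a single extract_subvector; the pair at Elt = 2 is
// odd-to-even (5 then 4, i.e. rhs elements 1 then 0 for 4-element sources),
// so it can become a <1, 0> sub-shuffle of (extract_subvector rhs, 0).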
7930 
7931 SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7932                                               SelectionDAG &DAG) const {
7933   SDLoc SL(Op);
7934   EVT ResultVT = Op.getValueType();
7935   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7936   MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
7937   const int NewSrcNumElts = 2;
7938   MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
7939   int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7940 
7941   // Break up the shuffle into register-sized pieces.
7942   //
7943   // We're trying to form sub-shuffles that the register allocation pipeline
7944   // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
7945   // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
7946   // pair of copies into a consecutive register copy, so use the ordinary
7947   // extract_vector_elt lowering unless we can use the shuffle.
7948   //
7949   // TODO: This is a bit of a hack, and we should probably always use
7950   // extract_subvector for the largest possible subvector we can (or at least
7951   // use it for PackVT-aligned pieces). However, we have worse support for
7952   // combines on them and don't directly treat extract_subvector /
7953   // insert_subvector as legal. The DAG scheduler also ends up doing a worse
7954   // job with the extract_subvectors.
7955   const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
7956 
7957   // vector_shuffle <0,1,6,7> lhs, rhs
7958   // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7959   //
7960   // vector_shuffle <6,7,2,3> lhs, rhs
7961   // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7962   //
7963   // vector_shuffle <6,7,0,1> lhs, rhs
7964   // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7965 
7966   // Avoid scalarizing when both halves are reading from consecutive elements.
7967 
7968   // If we're treating 2 element shuffles as legal, also create odd-to-even
7969   // shuffles of neighboring pairs.
7970   //
7971   // vector_shuffle <3,2,7,6> lhs, rhs
7972   //  -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
7973   //                    vector_shuffle <1, 0> (extract_subvector rhs, 2)
7974 
7975   SmallVector<SDValue, 16> Pieces;
7976   for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7977     if (ShouldUseConsecutiveExtract &&
7978         elementPairIsContiguous(SVN->getMask(), I)) {
7979       const int Idx = SVN->getMaskElt(I);
7980       int VecIdx = Idx < SrcNumElts ? 0 : 1;
7981       int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7982       SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
7983                                    SVN->getOperand(VecIdx),
7984                                    DAG.getConstant(EltIdx, SL, MVT::i32));
7985       Pieces.push_back(SubVec);
7986     } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
7987                isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
7988       int Idx0 = SVN->getMaskElt(I);
7989       int Idx1 = SVN->getMaskElt(I + 1);
7990 
7991       SDValue SrcOp0 = SVN->getOperand(0);
7992       SDValue SrcOp1 = SrcOp0;
7993       if (Idx0 >= SrcNumElts) {
7994         SrcOp0 = SVN->getOperand(1);
7995         Idx0 -= SrcNumElts;
7996       }
7997 
7998       if (Idx1 >= SrcNumElts) {
7999         SrcOp1 = SVN->getOperand(1);
8000         Idx1 -= SrcNumElts;
8001       }
8002 
8003       int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8004       int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8005 
8006       // Extract nearest even aligned piece.
8007       SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8008                                     DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8009       SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8010                                     DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8011 
8012       int NewMaskIdx0 = Idx0 - AlignedIdx0;
8013       int NewMaskIdx1 = Idx1 - AlignedIdx1;
8014 
8015       SDValue Result0 = SubVec0;
8016       SDValue Result1 = SubVec0;
8017 
8018       if (SubVec0 != SubVec1) {
8019         NewMaskIdx1 += NewSrcNumElts;
8020         Result1 = SubVec1;
8021       } else {
8022         Result1 = DAG.getPOISON(PackVT);
8023       }
8024 
8025       SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8026                                           {NewMaskIdx0, NewMaskIdx1});
8027       Pieces.push_back(Shuf);
8028     } else {
8029       const int Idx0 = SVN->getMaskElt(I);
8030       const int Idx1 = SVN->getMaskElt(I + 1);
8031       int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8032       int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8033       int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8034       int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8035 
8036       SDValue Vec0 = SVN->getOperand(VecIdx0);
8037       SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8038                                  DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8039 
8040       SDValue Vec1 = SVN->getOperand(VecIdx1);
8041       SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8042                                  DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8043       Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8044     }
8045   }
8046 
8047   return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8048 }
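
// Illustrative sketch of the fallback case in the loop above: a mask pair
// such as <0, 5> in a v4i16 shuffle is neither contiguous nor odd-to-even,
// so both lanes are scalarized into
//   build_vector (extract_vector_elt lhs, 0), (extract_vector_elt rhs, 1)
// and the resulting v2i16 piece joins the final concat_vectors.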
8049 
8050 SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8051                                                 SelectionDAG &DAG) const {
8052   SDValue SVal = Op.getOperand(0);
8053   EVT ResultVT = Op.getValueType();
8054   EVT SValVT = SVal.getValueType();
8055   SDValue UndefVal = DAG.getPOISON(SValVT);
8056   SDLoc SL(Op);
8057 
8058   SmallVector<SDValue, 8> VElts;
8059   VElts.push_back(SVal);
8060   for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8061     VElts.push_back(UndefVal);
8062 
8063   return DAG.getBuildVector(ResultVT, SL, VElts);
8064 }
8065 
8066 SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8067                                             SelectionDAG &DAG) const {
8068   SDLoc SL(Op);
8069   EVT VT = Op.getValueType();
8070 
8071   if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8072     assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8073 
8074     SDValue Lo = Op.getOperand(0);
8075     SDValue Hi = Op.getOperand(1);
8076 
8077     // Avoid adding defined bits with the zero_extend.
8078     if (Hi.isUndef()) {
8079       Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8080       SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8081       return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8082     }
8083 
8084     Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8085     Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8086 
8087     SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8088                                 DAG.getConstant(16, SL, MVT::i32));
8089     if (Lo.isUndef())
8090       return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8091 
8092     Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8093     Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8094 
8095     SDValue Or =
8096         DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8097     return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8098   }
8099 
8100   // Split into 2-element chunks.
8101   const unsigned NumParts = VT.getVectorNumElements() / 2;
8102   EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8103   MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8104 
8105   SmallVector<SDValue> Casts;
8106   for (unsigned P = 0; P < NumParts; ++P) {
8107     SDValue Vec = DAG.getBuildVector(
8108         PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8109     Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8110   }
8111 
8112   SDValue Blend =
8113       DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8114   return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8115 }
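
// A minimal scalar model of the v2i16/v2f16 packing path above (a sketch
// with a hypothetical name, not part of the lowering):
//
//   static uint32_t packV2I16(uint16_t Lo, uint16_t Hi) {
//     return static_cast<uint32_t>(Lo) | (static_cast<uint32_t>(Hi) << 16);
//   }
//
// Wider vectors form one such 32-bit pack per element pair and then build a
// vector of the packed dwords.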
8116 
8117 bool SITargetLowering::isOffsetFoldingLegal(
8118     const GlobalAddressSDNode *GA) const {
8119   // OSes that use ELF REL relocations (instead of RELA) can only store a
8120   // 32-bit addend in the instruction, so it is not safe to allow offset folding
8121   // which can create arbitrary 64-bit addends. (This is only a problem for
8122   // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8123   // the high 32 bits of the addend.)
8124   //
8125   // This should be kept in sync with how HasRelocationAddend is initialized in
8126   // the constructor of ELFAMDGPUAsmBackend.
8127   if (!Subtarget->isAmdHsaOS())
8128     return false;
8129 
8130   // We can fold offsets for anything that doesn't require a GOT relocation.
8131   return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8132           GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8133           GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8134          !shouldEmitGOTReloc(GA->getGlobal());
8135 }
8136 
8137 static SDValue
8138 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8139                         const SDLoc &DL, int64_t Offset, EVT PtrVT,
8140                         unsigned GAFlags = SIInstrInfo::MO_NONE) {
8141   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8142   // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8143   // lowered to the following code sequence:
8144   //
8145   // For constant address space:
8146   //   s_getpc_b64 s[0:1]
8147   //   s_add_u32 s0, s0, $symbol
8148   //   s_addc_u32 s1, s1, 0
8149   //
8150   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
8151   //   a fixup or relocation is emitted to replace $symbol with a literal
8152   //   constant, which is a pc-relative offset from the encoding of the $symbol
8153   //   operand to the global variable.
8154   //
8155   // For global address space:
8156   //   s_getpc_b64 s[0:1]
8157   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8158   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8159   //
8160   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
8161   //   fixups or relocations are emitted to replace $symbol@*@lo and
8162   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8163   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
8164   //   operand to the global variable.
8165   SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8166   SDValue PtrHi;
8167   if (GAFlags == SIInstrInfo::MO_NONE)
8168     PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8169   else
8170     PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8171   return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8172 }
8173 
8174 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8175                                              SDValue Op,
8176                                              SelectionDAG &DAG) const {
8177   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8178   SDLoc DL(GSD);
8179   EVT PtrVT = Op.getValueType();
8180 
8181   const GlobalValue *GV = GSD->getGlobal();
8182   if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8183        shouldUseLDSConstAddress(GV)) ||
8184       GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
8185       GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
8186     if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8187         GV->hasExternalLinkage()) {
8188       Type *Ty = GV->getValueType();
8189       // HIP uses an unsized array `extern __shared__ T s[]` or a similar
8190       // zero-sized type in other languages to declare dynamic shared memory
8191       // whose size is not known at compile time. Such variables are
8192       // allocated by the runtime and placed directly after the statically
8193       // allocated ones. They all share the same offset.
8194       if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8195         assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8196         // Adjust alignment for that dynamic shared memory array.
8197         Function &F = DAG.getMachineFunction().getFunction();
8198         MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
8199         MFI->setUsesDynamicLDS(true);
8200         return SDValue(
8201             DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8202       }
8203     }
8204     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
8205   }
8206 
8207   if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8208     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8209                                             SIInstrInfo::MO_ABS32_LO);
8210     return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8211   }
8212 
8213   if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8214     SDValue AddrLo = DAG.getTargetGlobalAddress(
8215         GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
8216     AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
8217 
8218     SDValue AddrHi = DAG.getTargetGlobalAddress(
8219         GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
8220     AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
8221 
8222     return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
8223   }
8224 
8225   if (shouldEmitFixup(GV))
8226     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
8227 
8228   if (shouldEmitPCReloc(GV))
8229     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
8230                                    SIInstrInfo::MO_REL32);
8231 
8232   SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
8233                                             SIInstrInfo::MO_GOTPCREL32);
8234   PointerType *PtrTy =
8235       PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
8236   const DataLayout &DataLayout = DAG.getDataLayout();
8237   Align Alignment = DataLayout.getABITypeAlign(PtrTy);
8238   MachinePointerInfo PtrInfo =
8239       MachinePointerInfo::getGOT(DAG.getMachineFunction());
8240 
8241   return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
8242                      MachineMemOperand::MODereferenceable |
8243                          MachineMemOperand::MOInvariant);
8244 }
8245 
8246 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
8247                                    const SDLoc &DL, SDValue V) const {
8248   // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
8249   // the destination register.
8250   //
8251   // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
8252   // so we will end up with redundant moves to m0.
8253   //
8254   // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
8255 
8256   // A Null SDValue creates a glue result.
8257   SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
8258                                   V, Chain);
8259   return SDValue(M0, 0);
8260 }
8261 
8262 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
8263                                                  MVT VT,
8264                                                  unsigned Offset) const {
8265   SDLoc SL(Op);
8266   SDValue Param = lowerKernargMemParameter(
8267       DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
8268   // The local size values will have the hi 16-bits as zero.
8269   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
8270                      DAG.getValueType(VT));
8271 }
8272 
8273 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8274                                         EVT VT) {
8275   DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8276       DAG.getMachineFunction().getFunction(),
8277       "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8278   return DAG.getPOISON(VT);
8279 }
8280 
8281 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8282                                          EVT VT) {
8283   DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8284       DAG.getMachineFunction().getFunction(),
8285       "intrinsic not supported on subtarget", DL.getDebugLoc()));
8286   return DAG.getPOISON(VT);
8287 }
8288 
8289 static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
8290                                     ArrayRef<SDValue> Elts) {
8291   assert(!Elts.empty());
8292   MVT Type;
8293   unsigned NumElts = Elts.size();
8294 
8295   if (NumElts <= 12) {
8296     Type = MVT::getVectorVT(MVT::f32, NumElts);
8297   } else {
8298     assert(Elts.size() <= 16);
8299     Type = MVT::v16f32;
8300     NumElts = 16;
8301   }
8302 
8303   SmallVector<SDValue, 16> VecElts(NumElts);
8304   for (unsigned i = 0; i < Elts.size(); ++i) {
8305     SDValue Elt = Elts[i];
8306     if (Elt.getValueType() != MVT::f32)
8307       Elt = DAG.getBitcast(MVT::f32, Elt);
8308     VecElts[i] = Elt;
8309   }
8310   for (unsigned i = Elts.size(); i < NumElts; ++i)
8311     VecElts[i] = DAG.getPOISON(MVT::f32);
8312 
8313   if (NumElts == 1)
8314     return VecElts[0];
8315   return DAG.getBuildVector(Type, DL, VecElts);
8316 }
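
// Worked example (illustrative): five i32 address elements become a v5f32
// build_vector after per-element bitcasts to f32; thirteen to sixteen
// elements are padded with poison up to v16f32, and more than sixteen would
// trip the assertion above.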
8317 
8318 static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
8319                               SDValue Src, int ExtraElts) {
8320   EVT SrcVT = Src.getValueType();
8321 
8322   SmallVector<SDValue, 8> Elts;
8323 
8324   if (SrcVT.isVector())
8325     DAG.ExtractVectorElements(Src, Elts);
8326   else
8327     Elts.push_back(Src);
8328 
8329   SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
8330   while (ExtraElts--)
8331     Elts.push_back(Undef);
8332 
8333   return DAG.getBuildVector(CastVT, DL, Elts);
8334 }
8335 
8336 // Re-construct the required return value for an image load intrinsic.
8337 // This is more complicated due to the optional use of TexFailCtrl, which
8338 // means the required return type is an aggregate.
8339 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
8340                                  ArrayRef<EVT> ResultTypes, bool IsTexFail,
8341                                  bool Unpacked, bool IsD16, int DMaskPop,
8342                                  int NumVDataDwords, bool IsAtomicPacked16Bit,
8343                                  const SDLoc &DL) {
8344   // Determine the required return type. This is the same regardless of
8345   // IsTexFail flag
8346   EVT ReqRetVT = ResultTypes[0];
8347   int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
8348   int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
8349                           ? (ReqRetNumElts + 1) / 2
8350                           : ReqRetNumElts;
8351 
8352   int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
8353 
8354   MVT DataDwordVT =
8355       NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
8356 
8357   MVT MaskPopVT =
8358       MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
8359 
8360   SDValue Data(Result, 0);
8361   SDValue TexFail;
8362 
8363   if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
8364     SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
8365     if (MaskPopVT.isVector()) {
8366       Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
8367                          SDValue(Result, 0), ZeroIdx);
8368     } else {
8369       Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
8370                          SDValue(Result, 0), ZeroIdx);
8371     }
8372   }
8373 
8374   if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
8375     Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
8376                           NumDataDwords - MaskPopDwords);
8377 
8378   if (IsD16)
8379     Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
8380 
8381   EVT LegalReqRetVT = ReqRetVT;
8382   if (!ReqRetVT.isVector()) {
8383     if (!Data.getValueType().isInteger())
8384       Data = DAG.getNode(ISD::BITCAST, DL,
8385                          Data.getValueType().changeTypeToInteger(), Data);
8386     Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
8387   } else {
8388     // We need to widen the return vector to a legal type
8389     if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
8390         ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
8391       LegalReqRetVT =
8392           EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
8393                            ReqRetVT.getVectorNumElements() + 1);
8394     }
8395   }
8396   Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
8397 
8398   if (IsTexFail) {
8399     TexFail =
8400         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
8401                     DAG.getConstant(MaskPopDwords, DL, MVT::i32));
8402 
8403     return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
8404   }
8405 
8406   if (Result->getNumValues() == 1)
8407     return Data;
8408 
8409   return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
8410 }
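
// Worked example of the dword accounting above (illustrative): a d16 load
// with ReqRetVT = v3f16 on a packing target (Unpacked = false) has
// ReqRetNumElts = 3, so NumDataDwords = (3 + 1) / 2 = 2; with DMaskPop = 3
// the instruction pops MaskPopDwords = (3 + 1) / 2 = 2 dwords, and the
// odd-sized v3f16 result is then widened to the legal v4f16.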
8411 
8412 static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
8413                          SDValue *LWE, bool &IsTexFail) {
8414   auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
8415 
8416   uint64_t Value = TexFailCtrlConst->getZExtValue();
8417   if (Value) {
8418     IsTexFail = true;
8419   }
8420 
8421   SDLoc DL(TexFailCtrlConst);
8422   *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
8423   Value &= ~(uint64_t)0x1;
8424   *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
8425   Value &= ~(uint64_t)0x2;
8426 
8427   return Value == 0;
8428 }
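
// Illustrative decoding of the two TexFailCtrl bits handled above:
//   TexFailCtrl = 0 -> TFE = 0, LWE = 0, IsTexFail = false
//   TexFailCtrl = 1 -> TFE = 1, LWE = 0, IsTexFail = true
//   TexFailCtrl = 2 -> TFE = 0, LWE = 1, IsTexFail = true
//   TexFailCtrl = 3 -> TFE = 1, LWE = 1, IsTexFail = true
// Any bit set outside 0x3 makes the function return false, in which case the
// caller leaves the intrinsic unlowered.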
8429 
8430 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
8431                                       MVT PackVectorVT,
8432                                       SmallVectorImpl<SDValue> &PackedAddrs,
8433                                       unsigned DimIdx, unsigned EndIdx,
8434                                       unsigned NumGradients) {
8435   SDLoc DL(Op);
8436   for (unsigned I = DimIdx; I < EndIdx; I++) {
8437     SDValue Addr = Op.getOperand(I);
8438 
8439     // Gradients are packed with undef for each coordinate.
8440     // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
8441     // 1D: undef,dx/dh; undef,dx/dv
8442     // 2D: dy/dh,dx/dh; dy/dv,dx/dv
8443     // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
8444     if (((I + 1) >= EndIdx) ||
8445         ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8446                                          I == DimIdx + NumGradients - 1))) {
8447       if (Addr.getValueType() != MVT::i16)
8448         Addr = DAG.getBitcast(MVT::i16, Addr);
8449       Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
8450     } else {
8451       Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
8452       I++;
8453     }
8454     Addr = DAG.getBitcast(MVT::f32, Addr);
8455     PackedAddrs.push_back(Addr);
8456   }
8457 }
8458 
8459 SDValue SITargetLowering::lowerImage(SDValue Op,
8460                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
8461                                      SelectionDAG &DAG, bool WithChain) const {
8462   SDLoc DL(Op);
8463   MachineFunction &MF = DAG.getMachineFunction();
8464   const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
8465   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
8466       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
8467   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
8468   unsigned IntrOpcode = Intr->BaseOpcode;
8469   bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
8470   bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
8471   bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
8472 
8473   SmallVector<EVT, 3> ResultTypes(Op->values());
8474   SmallVector<EVT, 3> OrigResultTypes(Op->values());
8475   bool IsD16 = false;
8476   bool IsG16 = false;
8477   bool IsA16 = false;
8478   SDValue VData;
8479   int NumVDataDwords = 0;
8480   bool AdjustRetType = false;
8481   bool IsAtomicPacked16Bit = false;
8482 
8483   // Offset of intrinsic arguments
8484   const unsigned ArgOffset = WithChain ? 2 : 1;
8485 
8486   unsigned DMask;
8487   unsigned DMaskLanes = 0;
8488 
8489   if (BaseOpcode->Atomic) {
8490     VData = Op.getOperand(2);
8491 
8492     IsAtomicPacked16Bit =
8493         (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8494          Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8495 
8496     bool Is64Bit = VData.getValueSizeInBits() == 64;
8497     if (BaseOpcode->AtomicX2) {
8498       SDValue VData2 = Op.getOperand(3);
8499       VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
8500                                  {VData, VData2});
8501       if (Is64Bit)
8502         VData = DAG.getBitcast(MVT::v4i32, VData);
8503 
8504       ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8505       DMask = Is64Bit ? 0xf : 0x3;
8506       NumVDataDwords = Is64Bit ? 4 : 2;
8507     } else {
8508       DMask = Is64Bit ? 0x3 : 0x1;
8509       NumVDataDwords = Is64Bit ? 2 : 1;
8510     }
8511   } else {
8512     DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8513     DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
8514 
8515     if (BaseOpcode->Store) {
8516       VData = Op.getOperand(2);
8517 
8518       MVT StoreVT = VData.getSimpleValueType();
8519       if (StoreVT.getScalarType() == MVT::f16) {
8520         if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8521           return Op; // D16 is unsupported for this instruction
8522 
8523         IsD16 = true;
8524         VData = handleD16VData(VData, DAG, true);
8525       }
8526 
8527       NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
8528     } else if (!BaseOpcode->NoReturn) {
8529       // Work out the number of dwords based on the dmask popcount, the
8530       // underlying type, and whether packing is supported.
8531       MVT LoadVT = ResultTypes[0].getSimpleVT();
8532       if (LoadVT.getScalarType() == MVT::f16) {
8533         if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8534           return Op; // D16 is unsupported for this instruction
8535 
8536         IsD16 = true;
8537       }
8538 
8539       // Confirm that the return type is large enough for the dmask specified
8540       if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
8541           (!LoadVT.isVector() && DMaskLanes > 1))
8542         return Op;
8543 
8544       // The sq block of gfx8 and gfx9 does not estimate register use
8545       // correctly for d16 image_gather4, image_gather4_l, and
8546       // image_gather4_lz instructions.
8547       if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
8548           !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
8549         NumVDataDwords = (DMaskLanes + 1) / 2;
8550       else
8551         NumVDataDwords = DMaskLanes;
8552 
8553       AdjustRetType = true;
8554     }
8555   }
8556 
8557   unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
8558   SmallVector<SDValue, 4> VAddrs;
8559 
8560   // Check for 16 bit addresses or derivatives and pack if true.
8561   // Check for 16-bit addresses or derivatives and pack them if so.
8562       Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8563   MVT VAddrScalarVT = VAddrVT.getScalarType();
8564   MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8565   IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8566 
8567   VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8568   VAddrScalarVT = VAddrVT.getScalarType();
8569   MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8570   IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8571 
8572   // Push back extra arguments.
8573   for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8574     if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8575       assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8576       // Special handling of bias when A16 is on. Bias is of type half but
8577       // occupies a full 32-bit slot.
8578       SDValue Bias = DAG.getBuildVector(
8579           MVT::v2f16, DL,
8580           {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
8581       VAddrs.push_back(Bias);
8582     } else {
8583       assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8584              "Bias needs to be converted to 16 bit in A16 mode");
8585       VAddrs.push_back(Op.getOperand(ArgOffset + I));
8586     }
8587   }
8588 
8589   if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8590     // 16-bit gradients are supported, but they are tied to the A16 control,
8591     // so both gradients and addresses must be 16-bit.
8592     LLVM_DEBUG(
8593         dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8594                   "require 16 bit args for both gradients and addresses");
8595     return Op;
8596   }
8597 
8598   if (IsA16) {
8599     if (!ST->hasA16()) {
8600       LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8601                            "support 16 bit addresses\n");
8602       return Op;
8603     }
8604   }
8605 
8606   // We've dealt with incorrect input, so we know that if IsA16 or IsG16 is
8607   // set then we have to compress/pack operands (either addresses, gradients,
8608   // or both).
8609   // In the case where A16 and gradients are tied (no G16 support), we have
8610   // already verified that both IsA16 and IsG16 are true.
8611   if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8612     // Activate g16
8613     const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8614         AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
8615     IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8616   }
8617 
8618   // Add gradients (packed or unpacked)
8619   if (IsG16) {
8620     // Pack the gradients
8621     // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8622     packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8623                               ArgOffset + Intr->GradientStart,
8624                               ArgOffset + Intr->CoordStart, Intr->NumGradients);
8625   } else {
8626     for (unsigned I = ArgOffset + Intr->GradientStart;
8627          I < ArgOffset + Intr->CoordStart; I++)
8628       VAddrs.push_back(Op.getOperand(I));
8629   }
8630 
8631   // Add addresses (packed or unpacked)
8632   if (IsA16) {
8633     packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8634                               ArgOffset + Intr->CoordStart, VAddrEnd,
8635                               0 /* No gradients */);
8636   } else {
8637     // Add uncompressed address
8638     for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8639       VAddrs.push_back(Op.getOperand(I));
8640   }
8641 
8642   // If the register allocator cannot place the address registers contiguously
8643   // without introducing moves, then using the non-sequential address encoding
8644   // is always preferable, since it saves VALU instructions and is usually a
8645   // wash in terms of code size or even better.
8646   //
8647   // However, we currently have no way of hinting to the register allocator that
8648   // MIMG addresses should be placed contiguously when it is possible to do so,
8649   // so force non-NSA for the common 2-address case as a heuristic.
8650   //
8651   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8652   // allocation when possible.
8653   //
8654   // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8655   // set of the remaining addresses.
8656   const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8657   const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8658   const bool UseNSA = ST->hasNSAEncoding() &&
8659                       VAddrs.size() >= ST->getNSAThreshold(MF) &&
8660                       (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8661   const bool UsePartialNSA =
8662       UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
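  // Worked example (illustrative, assuming the NSA threshold is met): with
  // NSAMaxSize = 5 and 7 address dwords, UsePartialNSA keeps the first 4
  // addresses as individual NSA operands and packs the remaining 3 into one
  // contiguous vector for the final operand.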
8663 
8664   SDValue VAddr;
8665   if (UsePartialNSA) {
8666     VAddr = getBuildDwordsVector(DAG, DL,
8667                                  ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8668   } else if (!UseNSA) {
8669     VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8670   }
8671 
8672   SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8673   SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8674   SDValue Unorm;
8675   if (!BaseOpcode->Sampler) {
8676     Unorm = True;
8677   } else {
8678     uint64_t UnormConst =
8679         Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8680 
8681     Unorm = UnormConst ? True : False;
8682   }
8683 
8684   SDValue TFE;
8685   SDValue LWE;
8686   SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8687   bool IsTexFail = false;
8688   if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8689     return Op;
8690 
8691   if (IsTexFail) {
8692     if (!DMaskLanes) {
8693       // Expecting to get an error flag since TFC is on and dmask is 0. Force
8694       // dmask to be at least 1, otherwise the instruction will fail.
8695       DMask = 0x1;
8696       DMaskLanes = 1;
8697       NumVDataDwords = 1;
8698     }
8699     NumVDataDwords += 1;
8700     AdjustRetType = true;
8701   }
8702 
8703   // Has something earlier tagged that the return type needs adjusting
8704   // Something earlier has tagged the return type as needing adjustment. This
8705   // happens if the instruction is a load or has set TexFailCtrl flags.
8706     // NumVDataDwords reflects the true number of dwords required in the return
8707     // type
8708     if (DMaskLanes == 0 && !BaseOpcode->Store) {
8709       // This is a no-op load. This can be eliminated
8710       SDValue Undef = DAG.getPOISON(Op.getValueType());
8711       if (isa<MemSDNode>(Op))
8712         return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
8713       return Undef;
8714     }
8715 
8716     EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
8717                                                       MVT::i32, NumVDataDwords)
8718                                    : MVT::i32;
8719 
8720     ResultTypes[0] = NewVT;
8721     if (ResultTypes.size() == 3) {
8722       // Original result was aggregate type used for TexFailCtrl results
8723       // The actual instruction returns as a vector type which has now been
8724       // created. Remove the aggregate result.
8725       ResultTypes.erase(&ResultTypes[1]);
8726     }
8727   }
8728 
8729   unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8730   if (BaseOpcode->Atomic)
8731     CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8732   if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8733                AMDGPU::CPol::VOLATILE))
8734     return Op;
8735 
8736   SmallVector<SDValue, 26> Ops;
8737   if (BaseOpcode->Store || BaseOpcode->Atomic)
8738     Ops.push_back(VData); // vdata
8739   if (UsePartialNSA) {
8740     append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8741     Ops.push_back(VAddr);
8742   } else if (UseNSA)
8743     append_range(Ops, VAddrs);
8744   else
8745     Ops.push_back(VAddr);
8746   SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
8747   EVT RsrcVT = Rsrc.getValueType();
8748   if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8749     return Op;
8750   Ops.push_back(Rsrc);
8751   if (BaseOpcode->Sampler) {
8752     SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
8753     if (Samp.getValueType() != MVT::v4i32)
8754       return Op;
8755     Ops.push_back(Samp);
8756   }
8757   Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
8758   if (IsGFX10Plus)
8759     Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8760   if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8761     Ops.push_back(Unorm);
8762   Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
8763   Ops.push_back(IsA16 && // r128, a16 for gfx9
8764                         ST->hasFeature(AMDGPU::FeatureR128A16)
8765                     ? True
8766                     : False);
8767   if (IsGFX10Plus)
8768     Ops.push_back(IsA16 ? True : False);
8769 
8770   if (!Subtarget->hasGFX90AInsts())
8771     Ops.push_back(TFE); // tfe
8772   else if (TFE->getAsZExtVal()) {
8773     DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8774         DAG.getMachineFunction().getFunction(),
8775         "TFE is not supported on this GPU", DL.getDebugLoc()));
8776   }
8777 
8778   if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8779     Ops.push_back(LWE); // lwe
8780   if (!IsGFX10Plus)
8781     Ops.push_back(DimInfo->DA ? True : False);
8782   if (BaseOpcode->HasD16)
8783     Ops.push_back(IsD16 ? True : False);
8784   if (isa<MemSDNode>(Op))
8785     Ops.push_back(Op.getOperand(0)); // chain
8786 
8787   int NumVAddrDwords =
8788       UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8789   int Opcode = -1;
8790 
8791   if (IsGFX12Plus) {
8792     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
8793                                    NumVDataDwords, NumVAddrDwords);
8794   } else if (IsGFX11Plus) {
8795     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8796                                    UseNSA ? AMDGPU::MIMGEncGfx11NSA
8797                                           : AMDGPU::MIMGEncGfx11Default,
8798                                    NumVDataDwords, NumVAddrDwords);
8799   } else if (IsGFX10Plus) {
8800     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8801                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
8802                                           : AMDGPU::MIMGEncGfx10Default,
8803                                    NumVDataDwords, NumVAddrDwords);
8804   } else {
8805     if (Subtarget->hasGFX90AInsts()) {
8806       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
8807                                      NumVDataDwords, NumVAddrDwords);
8808       if (Opcode == -1) {
8809         DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8810             DAG.getMachineFunction().getFunction(),
8811             "requested image instruction is not supported on this GPU",
8812             DL.getDebugLoc()));
8813 
8814         unsigned Idx = 0;
8815         SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
8816         for (EVT VT : OrigResultTypes) {
8817           if (VT == MVT::Other)
8818             RetValues[Idx++] = Op.getOperand(0); // Chain
8819           else
8820             RetValues[Idx++] = DAG.getPOISON(VT);
8821         }
8822 
8823         return DAG.getMergeValues(RetValues, DL);
8824       }
8825     }
8826     if (Opcode == -1 &&
8827         Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8828       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
8829                                      NumVDataDwords, NumVAddrDwords);
8830     if (Opcode == -1)
8831       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
8832                                      NumVDataDwords, NumVAddrDwords);
8833   }
8834   if (Opcode == -1)
8835     return Op;
8836 
8837   MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
8838   if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
8839     MachineMemOperand *MemRef = MemOp->getMemOperand();
8840     DAG.setNodeMemRefs(NewNode, {MemRef});
8841   }
8842 
8843   if (BaseOpcode->AtomicX2) {
8844     SmallVector<SDValue, 1> Elt;
8845     DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
8846     return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
8847   }
8848   if (BaseOpcode->NoReturn)
8849     return SDValue(NewNode, 0);
8850   return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
8851                            Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8852                            NumVDataDwords, IsAtomicPacked16Bit, DL);
8853 }
8854 
8855 SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8856                                        SDValue Offset, SDValue CachePolicy,
8857                                        SelectionDAG &DAG) const {
8858   MachineFunction &MF = DAG.getMachineFunction();
8859 
8860   const DataLayout &DataLayout = DAG.getDataLayout();
8861   Align Alignment =
8862       DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
8863 
8864   MachineMemOperand *MMO = MF.getMachineMemOperand(
8865       MachinePointerInfo(),
8866       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
8867           MachineMemOperand::MOInvariant,
8868       VT.getStoreSize(), Alignment);
8869 
8870   if (!Offset->isDivergent()) {
8871     SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8872 
8873     // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8874     // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8875     // loads. Later, the DAG combiner tries to combine s_buffer_load_u16 with
8876     // sext and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8877     if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8878       SDValue BufferLoad =
8879           DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
8880                                   DAG.getVTList(MVT::i32), Ops, VT, MMO);
8881       return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8882     }
8883 
8884     // Widen vec3 load to vec4.
8885     if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8886         !Subtarget->hasScalarDwordx3Loads()) {
8887       EVT WidenedVT =
8888           EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
8889       auto WidenedOp = DAG.getMemIntrinsicNode(
8890           AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8891           MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8892       auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8893                                    DAG.getVectorIdxConstant(0, DL));
8894       return Subvector;
8895     }
8896 
8897     return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
8898                                    DAG.getVTList(VT), Ops, VT, MMO);
8899   }
8900 
8901   // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8902   // assume that the buffer is unswizzled.
8903   SDValue Ops[] = {
8904       DAG.getEntryNode(),                    // Chain
8905       Rsrc,                                  // rsrc
8906       DAG.getConstant(0, DL, MVT::i32),      // vindex
8907       {},                                    // voffset
8908       {},                                    // soffset
8909       {},                                    // offset
8910       CachePolicy,                           // cachepolicy
8911       DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8912   };
8913   if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8914     setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8915     return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8916   }
8917 
8918   SmallVector<SDValue, 4> Loads;
8919   unsigned NumLoads = 1;
8920   MVT LoadVT = VT.getSimpleVT();
8921   unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8922   assert((LoadVT.getScalarType() == MVT::i32 ||
8923           LoadVT.getScalarType() == MVT::f32));
8924 
8925   if (NumElts == 8 || NumElts == 16) {
8926     NumLoads = NumElts / 4;
8927     LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8928   }
8929 
8930   SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
8931 
8932   // Use the alignment to ensure that the required offsets will fit into the
8933   // immediate offset fields.
8934   setBufferOffsets(Offset, DAG, &Ops[3],
8935                    NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8936 
8937   uint64_t InstOffset = Ops[5]->getAsZExtVal();
8938   for (unsigned i = 0; i < NumLoads; ++i) {
8939     Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8940     Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8941                                         LoadVT, MMO, DAG));
8942   }
8943 
8944   if (NumElts == 8 || NumElts == 16)
8945     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8946 
8947   return Loads[0];
8948 }
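
// Worked example of the divergent-offset path above (illustrative): a v8f32
// s.buffer.load with a divergent offset becomes two v4f32 MUBUF buffer loads
// at InstOffset and InstOffset + 16, and the two halves are concatenated
// back into a v8f32.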
8949 
8950 SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8951   // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8952   if (!Subtarget->hasArchitectedSGPRs())
8953     return {};
8954   SDLoc SL(Op);
8955   MVT VT = MVT::i32;
8956   SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8957   return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8958                      DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8959 }
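
// Equivalent scalar extraction of the BFE above (an illustrative sketch with
// a hypothetical name):
//
//   static unsigned waveIdInGroup(unsigned Ttmp8) {
//     return (Ttmp8 >> 25) & 0x1f; // 5 bits starting at bit 25
//   }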
8960 
8961 SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8962                                           unsigned Dim,
8963                                           const ArgDescriptor &Arg) const {
8964   SDLoc SL(Op);
8965   MachineFunction &MF = DAG.getMachineFunction();
8966   unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8967   if (MaxID == 0)
8968     return DAG.getConstant(0, SL, MVT::i32);
8969 
8970   // It's undefined behavior if a function marked with the amdgpu-no-*
8971   // attributes uses the corresponding intrinsic.
8972   if (!Arg)
8973     return DAG.getPOISON(Op->getValueType(0));
8974 
8975   SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8976                                SDLoc(DAG.getEntryNode()), Arg);
8977 
8978   // Don't bother inserting AssertZext for packed IDs since we're emitting the
8979   // masking operations anyway.
8980   //
8981   // TODO: We could assert the top bit is 0 for the source copy.
8982   if (Arg.isMasked())
8983     return Val;
8984 
8985   // Preserve the known bits after expansion to a copy.
8986   EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
8987   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8988                      DAG.getValueType(SmallVT));
8989 }
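
// Worked example (illustrative): on a target where the maximum workitem ID
// in this dimension is 1023, llvm::bit_width(1023) = 10, so the copy is
// tagged with AssertZext i10 and later combines know the top 22 bits of the
// i32 value are zero.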
8990 
8991 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8992                                                   SelectionDAG &DAG) const {
8993   MachineFunction &MF = DAG.getMachineFunction();
8994   auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
8995 
8996   EVT VT = Op.getValueType();
8997   SDLoc DL(Op);
8998   unsigned IntrinsicID = Op.getConstantOperandVal(0);
8999 
9000   // TODO: Should this propagate fast-math-flags?
9001 
9002   switch (IntrinsicID) {
9003   case Intrinsic::amdgcn_implicit_buffer_ptr: {
9004     if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9005       return emitNonHSAIntrinsicError(DAG, DL, VT);
9006     return getPreloadedValue(DAG, *MFI, VT,
9007                              AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9008   }
9009   case Intrinsic::amdgcn_dispatch_ptr:
9010   case Intrinsic::amdgcn_queue_ptr: {
9011     if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9012       DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9013           MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9014           DL.getDebugLoc()));
9015       return DAG.getPOISON(VT);
9016     }
9017 
9018     auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9019                      ? AMDGPUFunctionArgInfo::DISPATCH_PTR
9020                      : AMDGPUFunctionArgInfo::QUEUE_PTR;
9021     return getPreloadedValue(DAG, *MFI, VT, RegID);
9022   }
9023   case Intrinsic::amdgcn_implicitarg_ptr: {
9024     if (MFI->isEntryFunction())
9025       return getImplicitArgPtr(DAG, DL);
9026     return getPreloadedValue(DAG, *MFI, VT,
9027                              AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9028   }
9029   case Intrinsic::amdgcn_kernarg_segment_ptr: {
9030     if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
9031       // This only makes sense to call in a kernel, so just lower to null.
9032       return DAG.getConstant(0, DL, VT);
9033     }
9034 
9035     return getPreloadedValue(DAG, *MFI, VT,
9036                              AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
9037   }
9038   case Intrinsic::amdgcn_dispatch_id: {
9039     return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9040   }
9041   case Intrinsic::amdgcn_rcp:
9042     return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9043   case Intrinsic::amdgcn_rsq:
9044     return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9045   case Intrinsic::amdgcn_rsq_legacy:
9046     if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9047       return emitRemovedIntrinsicError(DAG, DL, VT);
9048     return SDValue();
9049   case Intrinsic::amdgcn_rcp_legacy:
9050     if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9051       return emitRemovedIntrinsicError(DAG, DL, VT);
9052     return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9053   case Intrinsic::amdgcn_rsq_clamp: {
9054     if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9055       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9056 
9057     Type *Type = VT.getTypeForEVT(*DAG.getContext());
9058     APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9059     APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9060 
9061     SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9062     SDValue Tmp =
9063         DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9064     return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9065                        DAG.getConstantFP(Min, DL, VT));
9066   }
9067   case Intrinsic::r600_read_ngroups_x:
9068     if (Subtarget->isAmdHsaOS())
9069       return emitNonHSAIntrinsicError(DAG, DL, VT);
9070 
9071     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9072                                     SI::KernelInputOffsets::NGROUPS_X, Align(4),
9073                                     false);
9074   case Intrinsic::r600_read_ngroups_y:
9075     if (Subtarget->isAmdHsaOS())
9076       return emitNonHSAIntrinsicError(DAG, DL, VT);
9077 
9078     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9079                                     SI::KernelInputOffsets::NGROUPS_Y, Align(4),
9080                                     false);
9081   case Intrinsic::r600_read_ngroups_z:
9082     if (Subtarget->isAmdHsaOS())
9083       return emitNonHSAIntrinsicError(DAG, DL, VT);
9084 
9085     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9086                                     SI::KernelInputOffsets::NGROUPS_Z, Align(4),
9087                                     false);
9088   case Intrinsic::r600_read_local_size_x:
9089     if (Subtarget->isAmdHsaOS())
9090       return emitNonHSAIntrinsicError(DAG, DL, VT);
9091 
9092     return lowerImplicitZextParam(DAG, Op, MVT::i16,
9093                                   SI::KernelInputOffsets::LOCAL_SIZE_X);
9094   case Intrinsic::r600_read_local_size_y:
9095     if (Subtarget->isAmdHsaOS())
9096       return emitNonHSAIntrinsicError(DAG, DL, VT);
9097 
9098     return lowerImplicitZextParam(DAG, Op, MVT::i16,
9099                                   SI::KernelInputOffsets::LOCAL_SIZE_Y);
9100   case Intrinsic::r600_read_local_size_z:
9101     if (Subtarget->isAmdHsaOS())
9102       return emitNonHSAIntrinsicError(DAG, DL, VT);
9103 
9104     return lowerImplicitZextParam(DAG, Op, MVT::i16,
9105                                   SI::KernelInputOffsets::LOCAL_SIZE_Z);
9106   case Intrinsic::amdgcn_workgroup_id_x:
9107     return getPreloadedValue(DAG, *MFI, VT,
9108                              AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
9109   case Intrinsic::amdgcn_workgroup_id_y:
9110     return getPreloadedValue(DAG, *MFI, VT,
9111                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
9112   case Intrinsic::amdgcn_workgroup_id_z:
9113     return getPreloadedValue(DAG, *MFI, VT,
9114                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
9115   case Intrinsic::amdgcn_wave_id:
9116     return lowerWaveID(DAG, Op);
9117   case Intrinsic::amdgcn_lds_kernel_id: {
9118     if (MFI->isEntryFunction())
9119       return getLDSKernelId(DAG, DL);
9120     return getPreloadedValue(DAG, *MFI, VT,
9121                              AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
9122   }
9123   case Intrinsic::amdgcn_workitem_id_x:
9124     return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9125   case Intrinsic::amdgcn_workitem_id_y:
9126     return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9127   case Intrinsic::amdgcn_workitem_id_z:
9128     return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9129   case Intrinsic::amdgcn_wavefrontsize:
9130     return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
9131                            SDLoc(Op), MVT::i32);
9132   case Intrinsic::amdgcn_s_buffer_load: {
9133     unsigned CPol = Op.getConstantOperandVal(3);
9134     // s_buffer_load, because of how it is optimized, can't be volatile, so
9135     // reject any cache policy with unsupported bits (such as volatile) set.
9136     if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
9137                      ? AMDGPU::CPol::ALL
9138                      : AMDGPU::CPol::ALL_pregfx12))
9139       return Op;
9140     return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9141                         Op.getOperand(3), DAG);
9142   }
9143   case Intrinsic::amdgcn_fdiv_fast:
9144     return lowerFDIV_FAST(Op, DAG);
9145   case Intrinsic::amdgcn_sin:
9146     return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
9147 
9148   case Intrinsic::amdgcn_cos:
9149     return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
9150 
9151   case Intrinsic::amdgcn_mul_u24:
9152     return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
9153                        Op.getOperand(2));
9154   case Intrinsic::amdgcn_mul_i24:
9155     return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
9156                        Op.getOperand(2));
9157 
9158   case Intrinsic::amdgcn_log_clamp: {
9159     if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9160       return SDValue();
9161 
9162     return emitRemovedIntrinsicError(DAG, DL, VT);
9163   }
9164   case Intrinsic::amdgcn_fract:
9165     return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
9166 
9167   case Intrinsic::amdgcn_class:
9168     return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
9169                        Op.getOperand(2));
9170   case Intrinsic::amdgcn_div_fmas:
9171     return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
9172                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9173 
9174   case Intrinsic::amdgcn_div_fixup:
9175     return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
9176                        Op.getOperand(2), Op.getOperand(3));
9177 
9178   case Intrinsic::amdgcn_div_scale: {
9179     const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
9180 
9181     // Translate to the operands expected by the machine instruction. The
9182     // first operand must match either the numerator or the denominator.
9183     SDValue Numerator = Op.getOperand(1);
9184     SDValue Denominator = Op.getOperand(2);
9185 
9186     // Note this order is the opposite of the machine instruction's operand
9187     // order, which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator.
9188     // The intrinsic has the numerator as the first operand to match a
9189     // normal division operation.
9190 
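    // For example (illustrative IR, not from this file):
    //   %r = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %n, float %d,
    //                                                      i1 true)
    // has an all-ones Param, so Src0 below is the numerator and the node
    // becomes DIV_SCALE %n, %d, %n; with i1 false it would be
    // DIV_SCALE %d, %d, %n.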
9191     SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9192 
9193     return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
9194                        Denominator, Numerator);
9195   }
9196   case Intrinsic::amdgcn_icmp: {
9197     // There is a Pat that handles this variant, so return it as-is.
9198     if (Op.getOperand(1).getValueType() == MVT::i1 &&
9199         Op.getConstantOperandVal(2) == 0 &&
9200         Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
9201       return Op;
9202     return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
9203   }
9204   case Intrinsic::amdgcn_fcmp: {
9205     return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
9206   }
9207   case Intrinsic::amdgcn_ballot:
9208     return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
9209   case Intrinsic::amdgcn_fmed3:
9210     return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
9211                        Op.getOperand(2), Op.getOperand(3));
9212   case Intrinsic::amdgcn_fdot2:
9213     return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
9214                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9215   case Intrinsic::amdgcn_fmul_legacy:
9216     return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
9217                        Op.getOperand(2));
9218   case Intrinsic::amdgcn_sffbh:
9219     return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
9220   case Intrinsic::amdgcn_sbfe:
9221     return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
9222                        Op.getOperand(2), Op.getOperand(3));
9223   case Intrinsic::amdgcn_ubfe:
9224     return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
9225                        Op.getOperand(2), Op.getOperand(3));
9226   case Intrinsic::amdgcn_cvt_pkrtz:
9227   case Intrinsic::amdgcn_cvt_pknorm_i16:
9228   case Intrinsic::amdgcn_cvt_pknorm_u16:
9229   case Intrinsic::amdgcn_cvt_pk_i16:
9230   case Intrinsic::amdgcn_cvt_pk_u16: {
9231     // FIXME: Stop adding cast if v2f16/v2i16 are legal.
9232     EVT VT = Op.getValueType();
9233     unsigned Opcode;
9234 
9235     if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
9236       Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
9237     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
9238       Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
9239     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
9240       Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
9241     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
9242       Opcode = AMDGPUISD::CVT_PK_I16_I32;
9243     else
9244       Opcode = AMDGPUISD::CVT_PK_U16_U32;
9245 
9246     if (isTypeLegal(VT))
9247       return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
9248 
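    // Illustrative: when v2f16/v2i16 is not legal, the pack is built as an
    // i32 node and bitcast back, e.g.
    //   (bitcast (CVT_PKRTZ_F16_F32 a, b) to v2f16)
    // so the packed halves travel as a plain i32 until legalization.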
9249     SDValue Node =
9250         DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
9251     return DAG.getNode(ISD::BITCAST, DL, VT, Node);
9252   }
9253   case Intrinsic::amdgcn_fmad_ftz:
9254     return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
9255                        Op.getOperand(2), Op.getOperand(3));
9256 
9257   case Intrinsic::amdgcn_if_break:
9258     return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
9259                                       Op->getOperand(1), Op->getOperand(2)),
9260                    0);
9261 
9262   case Intrinsic::amdgcn_groupstaticsize: {
9263     Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
9264     if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
9265       return Op;
9266 
9267     const Module *M = MF.getFunction().getParent();
9268     const GlobalValue *GV =
9269         Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
9270     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
9271                                             SIInstrInfo::MO_ABS32_LO);
9272     return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
9273   }
9274   case Intrinsic::amdgcn_is_shared:
9275   case Intrinsic::amdgcn_is_private: {
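    // Illustrative IR handled here (not from this file):
    //   %r = call i1 @llvm.amdgcn.is.shared(ptr %p)
    // The test reduces to comparing the high 32 bits of the flat pointer
    // against the corresponding aperture base.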
9276     SDLoc SL(Op);
9277     unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
9278                       ? AMDGPUAS::LOCAL_ADDRESS
9279                       : AMDGPUAS::PRIVATE_ADDRESS;
9280     SDValue Aperture = getSegmentAperture(AS, SL, DAG);
9281     SDValue SrcVec =
9282         DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
9283 
9284     SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
9285                                 DAG.getConstant(1, SL, MVT::i32));
9286     return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
9287   }
9288   case Intrinsic::amdgcn_perm:
9289     return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
9290                        Op.getOperand(2), Op.getOperand(3));
9291   case Intrinsic::amdgcn_reloc_constant: {
9292     Module *M = const_cast<Module *>(MF.getFunction().getParent());
9293     const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
9294     auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
9295     auto *RelocSymbol = cast<GlobalVariable>(
9296         M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
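    // The result is a 32-bit absolute-symbol move, illustratively
    //   s_mov_b32 s0, sym@abs32@lo
    // where MO_ABS32_LO attaches the relocation to be patched later.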
9297     SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
9298                                             SIInstrInfo::MO_ABS32_LO);
9299     return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
9300   }
9301   case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
9302   case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
9303   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
9304   case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
9305   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
9306   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
9307   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
9308   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
9309     if (Op.getOperand(4).getValueType() == MVT::i32)
9310       return SDValue();
9311 
9312     SDLoc SL(Op);
9313     auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
9314     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9315                        Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9316                        Op.getOperand(3), IndexKeyi32);
9317   }
9318   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
9319   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
9320   case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
9321     if (Op.getOperand(6).getValueType() == MVT::i32)
9322       return SDValue();
9323 
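    // E.g. an i16 index key is any-extended to i32 here so instruction
    // selection only needs to match a single index-key type.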
9324     SDLoc SL(Op);
9325     auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
9326     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9327                        {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9328                         Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9329                         IndexKeyi32, Op.getOperand(7)});
9330   }
9331   case Intrinsic::amdgcn_addrspacecast_nonnull:
9332     return lowerADDRSPACECAST(Op, DAG);
9333   case Intrinsic::amdgcn_readlane:
9334   case Intrinsic::amdgcn_readfirstlane:
9335   case Intrinsic::amdgcn_writelane:
9336   case Intrinsic::amdgcn_permlane16:
9337   case Intrinsic::amdgcn_permlanex16:
9338   case Intrinsic::amdgcn_permlane64:
9339   case Intrinsic::amdgcn_set_inactive:
9340   case Intrinsic::amdgcn_set_inactive_chain_arg:
9341   case Intrinsic::amdgcn_mov_dpp8:
9342   case Intrinsic::amdgcn_update_dpp:
9343     return lowerLaneOp(*this, Op.getNode(), DAG);
9344   case Intrinsic::amdgcn_dead: {
9345     SmallVector<SDValue, 8> Poisons;
9346     for (const EVT ValTy : Op.getNode()->values())
9347       Poisons.push_back(DAG.getPOISON(ValTy));
9348     return DAG.getMergeValues(Poisons, SDLoc(Op));
9349   }
9350   default:
9351     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9352             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
9353       return lowerImage(Op, ImageDimIntr, DAG, false);
9354 
9355     return Op;
9356   }
9357 }
9358 
9359 // On targets that do not support a constant in the soffset field, turn a
9360 // zero soffset into SGPR_NULL to avoid generating an extra s_mov of zero.
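// Illustratively, this lets a zero soffset select to assembly like
//   buffer_load_dword v0, v1, s[0:3], null offen
// (a sketch) instead of first materializing 0 with an s_mov_b32.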
9361 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
9362                              const GCNSubtarget *Subtarget) {
9363   if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
9364     return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
9365   return SOffset;
9366 }
9367 
9368 SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
9369                                                      SelectionDAG &DAG,
9370                                                      unsigned NewOpcode) const {
9371   SDLoc DL(Op);
9372 
9373   SDValue VData = Op.getOperand(2);
9374   SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9375   auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9376   auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9377   SDValue Ops[] = {
9378       Op.getOperand(0),                      // Chain
9379       VData,                                 // vdata
9380       Rsrc,                                  // rsrc
9381       DAG.getConstant(0, DL, MVT::i32),      // vindex
9382       VOffset,                               // voffset
9383       SOffset,                               // soffset
9384       Offset,                                // offset
9385       Op.getOperand(6),                      // cachepolicy
9386       DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9387   };
9388 
9389   auto *M = cast<MemSDNode>(Op);
9390 
9391   EVT MemVT = VData.getValueType();
9392   return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9393                                  M->getMemOperand());
9394 }
9395 
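// Like lowerRawBufferAtomicIntrin above, but the struct forms carry an
// explicit vindex (intrinsic operand 4, idxen=1), shifting voffset/soffset to
// operands 5 and 6. Illustrative IR (not from this file):
//   %r = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(
//            float %v, <4 x i32> %rsrc, i32 %idx, i32 %voff, i32 %soff, i32 0)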
9396 SDValue
9397 SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
9398                                                 unsigned NewOpcode) const {
9399   SDLoc DL(Op);
9400 
9401   SDValue VData = Op.getOperand(2);
9402   SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9403   auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9404   auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9405   SDValue Ops[] = {
9406       Op.getOperand(0),                      // Chain
9407       VData,                                 // vdata
9408       Rsrc,                                  // rsrc
9409       Op.getOperand(4),                      // vindex
9410       VOffset,                               // voffset
9411       SOffset,                               // soffset
9412       Offset,                                // offset
9413       Op.getOperand(7),                      // cachepolicy
9414       DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9415   };
9416 
9417   auto *M = cast<MemSDNode>(Op);
9418 
9419   EVT MemVT = VData.getValueType();
9420   return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9421                                  M->getMemOperand());
9422 }
9423 
9424 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
9425                                                  SelectionDAG &DAG) const {
9426   unsigned IntrID = Op.getConstantOperandVal(1);
9427   SDLoc DL(Op);
9428 
9429   switch (IntrID) {
9430   case Intrinsic::amdgcn_ds_ordered_add:
9431   case Intrinsic::amdgcn_ds_ordered_swap: {
9432     MemSDNode *M = cast<MemSDNode>(Op);
9433     SDValue Chain = M->getOperand(0);
9434     SDValue M0 = M->getOperand(2);
9435     SDValue Value = M->getOperand(3);
9436     unsigned IndexOperand = M->getConstantOperandVal(7);
9437     unsigned WaveRelease = M->getConstantOperandVal(8);
9438     unsigned WaveDone = M->getConstantOperandVal(9);
9439 
9440     unsigned OrderedCountIndex = IndexOperand & 0x3f;
9441     IndexOperand &= ~0x3f;
9442     unsigned CountDw = 0;
9443 
9444     if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
9445       CountDw = (IndexOperand >> 24) & 0xf;
9446       IndexOperand &= ~(0xf << 24);
9447 
9448       if (CountDw < 1 || CountDw > 4) {
9449         const Function &Fn = DAG.getMachineFunction().getFunction();
9450         DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9451             Fn, "ds_ordered_count: dword count must be between 1 and 4",
9452             DL.getDebugLoc()));
9453         CountDw = 1;
9454       }
9455     }
9456 
9457     if (IndexOperand) {
9458       const Function &Fn = DAG.getMachineFunction().getFunction();
9459       DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9460           Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
9461     }
9462 
9463     if (WaveDone && !WaveRelease) {
9464       // TODO: Move this to IR verifier
9465       const Function &Fn = DAG.getMachineFunction().getFunction();
9466       DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9467           Fn, "ds_ordered_count: wave_done requires wave_release",
9468           DL.getDebugLoc()));
9469     }
9470 
9471     unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9472     unsigned ShaderType =
9473         SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
9474     unsigned Offset0 = OrderedCountIndex << 2;
9475     unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9476 
9477     if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
9478       Offset1 |= (CountDw - 1) << 6;
9479 
9480     if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
9481       Offset1 |= ShaderType << 2;
9482 
9483     unsigned Offset = Offset0 | (Offset1 << 8);
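    // Worked illustration: ds_ordered_add (Instruction=0) with
    // OrderedCountIndex=1, WaveRelease=1, WaveDone=0 gives
    // Offset0 = 1 << 2 = 0x4 and Offset1 = 0x1, so, ignoring the
    // subtarget-dependent CountDw and ShaderType fields ORed in above,
    // Offset = 0x4 | (0x1 << 8) = 0x104.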
9484 
9485     SDValue Ops[] = {
9486         Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
9487         copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
9488     };
9489     return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
9490                                    M->getVTList(), Ops, M->getMemoryVT(),
9491                                    M->getMemOperand());
9492   }
9493   case Intrinsic::amdgcn_raw_buffer_load:
9494   case Intrinsic::amdgcn_raw_ptr_buffer_load:
9495   case Intrinsic::amdgcn_raw_atomic_buffer_load:
9496   case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9497   case Intrinsic::amdgcn_raw_buffer_load_format:
9498   case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9499     const bool IsFormat =
9500         IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9501         IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9502 
9503     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9504     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9505     auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9506     SDValue Ops[] = {
9507         Op.getOperand(0),                      // Chain
9508         Rsrc,                                  // rsrc
9509         DAG.getConstant(0, DL, MVT::i32),      // vindex
9510         VOffset,                               // voffset
9511         SOffset,                               // soffset
9512         Offset,                                // offset
9513         Op.getOperand(5),                      // cachepolicy, swizzled buffer
9514         DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9515     };
9516 
9517     auto *M = cast<MemSDNode>(Op);
9518     return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9519   }
9520   case Intrinsic::amdgcn_struct_buffer_load:
9521   case Intrinsic::amdgcn_struct_ptr_buffer_load:
9522   case Intrinsic::amdgcn_struct_buffer_load_format:
9523   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9524   case Intrinsic::amdgcn_struct_atomic_buffer_load:
9525   case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9526     const bool IsFormat =
9527         IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9528         IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9529 
9530     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9531     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9532     auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9533     SDValue Ops[] = {
9534         Op.getOperand(0),                      // Chain
9535         Rsrc,                                  // rsrc
9536         Op.getOperand(3),                      // vindex
9537         VOffset,                               // voffset
9538         SOffset,                               // soffset
9539         Offset,                                // offset
9540         Op.getOperand(6),                      // cachepolicy, swizzled buffer
9541         DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9542     };
9543 
9544     return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
9545   }
9546   case Intrinsic::amdgcn_raw_tbuffer_load:
9547   case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9548     MemSDNode *M = cast<MemSDNode>(Op);
9549     EVT LoadVT = Op.getValueType();
9550     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9551     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9552     auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9553 
9554     SDValue Ops[] = {
9555         Op.getOperand(0),                      // Chain
9556         Rsrc,                                  // rsrc
9557         DAG.getConstant(0, DL, MVT::i32),      // vindex
9558         VOffset,                               // voffset
9559         SOffset,                               // soffset
9560         Offset,                                // offset
9561         Op.getOperand(5),                      // format
9562         Op.getOperand(6),                      // cachepolicy, swizzled buffer
9563         DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9564     };
9565 
9566     if (LoadVT.getScalarType() == MVT::f16)
9567       return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9568                                  Ops);
9569     return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9570                                Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9571                                DAG);
9572   }
9573   case Intrinsic::amdgcn_struct_tbuffer_load:
9574   case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9575     MemSDNode *M = cast<MemSDNode>(Op);
9576     EVT LoadVT = Op.getValueType();
9577     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9578     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9579     auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9580 
9581     SDValue Ops[] = {
9582         Op.getOperand(0),                      // Chain
9583         Rsrc,                                  // rsrc
9584         Op.getOperand(3),                      // vindex
9585         VOffset,                               // voffset
9586         SOffset,                               // soffset
9587         Offset,                                // offset
9588         Op.getOperand(6),                      // format
9589         Op.getOperand(7),                      // cachepolicy, swizzled buffer
9590         DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9591     };
9592 
9593     if (LoadVT.getScalarType() == MVT::f16)
9594       return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9595                                  Ops);
9596     return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9597                                Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9598                                DAG);
9599   }
9600   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9601   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9602     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9603   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9604   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9605     return lowerStructBufferAtomicIntrin(Op, DAG,
9606                                          AMDGPUISD::BUFFER_ATOMIC_FADD);
9607   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9608   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9609     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9610   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9611   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9612     return lowerStructBufferAtomicIntrin(Op, DAG,
9613                                          AMDGPUISD::BUFFER_ATOMIC_FMIN);
9614   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9615   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9616     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9617   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9618   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9619     return lowerStructBufferAtomicIntrin(Op, DAG,
9620                                          AMDGPUISD::BUFFER_ATOMIC_FMAX);
9621   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9622   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9623     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9624   case Intrinsic::amdgcn_raw_buffer_atomic_add:
9625   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9626     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9627   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9628   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9629     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9630   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9631   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9632     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9633   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9634   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9635     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9636   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9637   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9638     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9639   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9640   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9641     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9642   case Intrinsic::amdgcn_raw_buffer_atomic_and:
9643   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9644     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9645   case Intrinsic::amdgcn_raw_buffer_atomic_or:
9646   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9647     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9648   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9649   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9650     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9651   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9652   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9653     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9654   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9655   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9656     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9657   case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9658     return lowerRawBufferAtomicIntrin(Op, DAG,
9659                                       AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9660   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9661   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9662     return lowerStructBufferAtomicIntrin(Op, DAG,
9663                                          AMDGPUISD::BUFFER_ATOMIC_SWAP);
9664   case Intrinsic::amdgcn_struct_buffer_atomic_add:
9665   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9666     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9667   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9668   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9669     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9670   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9671   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9672     return lowerStructBufferAtomicIntrin(Op, DAG,
9673                                          AMDGPUISD::BUFFER_ATOMIC_SMIN);
9674   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9675   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9676     return lowerStructBufferAtomicIntrin(Op, DAG,
9677                                          AMDGPUISD::BUFFER_ATOMIC_UMIN);
9678   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9679   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9680     return lowerStructBufferAtomicIntrin(Op, DAG,
9681                                          AMDGPUISD::BUFFER_ATOMIC_SMAX);
9682   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9683   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9684     return lowerStructBufferAtomicIntrin(Op, DAG,
9685                                          AMDGPUISD::BUFFER_ATOMIC_UMAX);
9686   case Intrinsic::amdgcn_struct_buffer_atomic_and:
9687   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9688     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9689   case Intrinsic::amdgcn_struct_buffer_atomic_or:
9690   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9691     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9692   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9693   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9694     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9695   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9696   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9697     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9698   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9699   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9700     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9701   case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9702     return lowerStructBufferAtomicIntrin(Op, DAG,
9703                                          AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9704 
9705   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9706   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9707     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9708     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9709     auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9710     SDValue Ops[] = {
9711         Op.getOperand(0),                      // Chain
9712         Op.getOperand(2),                      // src
9713         Op.getOperand(3),                      // cmp
9714         Rsrc,                                  // rsrc
9715         DAG.getConstant(0, DL, MVT::i32),      // vindex
9716         VOffset,                               // voffset
9717         SOffset,                               // soffset
9718         Offset,                                // offset
9719         Op.getOperand(7),                      // cachepolicy
9720         DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9721     };
9722     EVT VT = Op.getValueType();
9723     auto *M = cast<MemSDNode>(Op);
9724 
9725     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9726                                    Op->getVTList(), Ops, VT,
9727                                    M->getMemOperand());
9728   }
9729   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9730   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9731     SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9732     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
9733     auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9734     SDValue Ops[] = {
9735         Op.getOperand(0),                      // Chain
9736         Op.getOperand(2),                      // src
9737         Op.getOperand(3),                      // cmp
9738         Rsrc,                                  // rsrc
9739         Op.getOperand(5),                      // vindex
9740         VOffset,                               // voffset
9741         SOffset,                               // soffset
9742         Offset,                                // offset
9743         Op.getOperand(8),                      // cachepolicy
9744         DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9745     };
9746     EVT VT = Op.getValueType();
9747     auto *M = cast<MemSDNode>(Op);
9748 
9749     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9750                                    Op->getVTList(), Ops, VT,
9751                                    M->getMemOperand());
9752   }
9753   case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
9754   case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
9755     MemSDNode *M = cast<MemSDNode>(Op);
9756     SDValue NodePtr = M->getOperand(2);
9757     SDValue RayExtent = M->getOperand(3);
9758     SDValue InstanceMask = M->getOperand(4);
9759     SDValue RayOrigin = M->getOperand(5);
9760     SDValue RayDir = M->getOperand(6);
9761     SDValue Offsets = M->getOperand(7);
9762     SDValue TDescr = M->getOperand(8);
9763 
9764     assert(NodePtr.getValueType() == MVT::i64);
9765     assert(RayDir.getValueType() == MVT::v3f32);
9766 
9767     if (!Subtarget->hasBVHDualAndBVH8Insts()) {
9768       emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9769       return SDValue();
9770     }
9771 
9772     bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
9773     const unsigned NumVDataDwords = 10;
9774     const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
9775     int Opcode = AMDGPU::getMIMGOpcode(
9776         IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
9777                : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
9778         AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
9779     assert(Opcode != -1);
9780 
9781     SmallVector<SDValue, 7> Ops;
9782     Ops.push_back(NodePtr);
9783     Ops.push_back(DAG.getBuildVector(
9784         MVT::v2i32, DL,
9785         {DAG.getBitcast(MVT::i32, RayExtent),
9786          DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
9787     Ops.push_back(RayOrigin);
9788     Ops.push_back(RayDir);
9789     Ops.push_back(Offsets);
9790     Ops.push_back(TDescr);
9791     Ops.push_back(M->getChain());
9792 
9793     auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9794     MachineMemOperand *MemRef = M->getMemOperand();
9795     DAG.setNodeMemRefs(NewNode, {MemRef});
9796     return SDValue(NewNode, 0);
9797   }
9798   case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9799     MemSDNode *M = cast<MemSDNode>(Op);
9800     SDValue NodePtr = M->getOperand(2);
9801     SDValue RayExtent = M->getOperand(3);
9802     SDValue RayOrigin = M->getOperand(4);
9803     SDValue RayDir = M->getOperand(5);
9804     SDValue RayInvDir = M->getOperand(6);
9805     SDValue TDescr = M->getOperand(7);
9806 
9807     assert(NodePtr.getValueType() == MVT::i32 ||
9808            NodePtr.getValueType() == MVT::i64);
9809     assert(RayDir.getValueType() == MVT::v3f16 ||
9810            RayDir.getValueType() == MVT::v3f32);
9811 
9812     if (!Subtarget->hasGFX10_AEncoding()) {
9813       emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9814       return SDValue();
9815     }
9816 
9817     const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9818     const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9819     const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9820     const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9821     const bool Is64 = NodePtr.getValueType() == MVT::i64;
9822     const unsigned NumVDataDwords = 4;
9823     const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9824     const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9825     const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9826                          NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9827                         IsGFX12Plus;
9828     const unsigned BaseOpcodes[2][2] = {
9829         {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9830         {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9831          AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9832     int Opcode;
9833     if (UseNSA) {
9834       Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9835                                      IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9836                                      : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
9837                                                  : AMDGPU::MIMGEncGfx10NSA,
9838                                      NumVDataDwords, NumVAddrDwords);
9839     } else {
9840       assert(!IsGFX12Plus);
9841       Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9842                                      IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9843                                              : AMDGPU::MIMGEncGfx10Default,
9844                                      NumVDataDwords, NumVAddrDwords);
9845     }
9846     assert(Opcode != -1);
9847 
9848     SmallVector<SDValue, 16> Ops;
9849 
9850     auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
9851       SmallVector<SDValue, 3> Lanes;
9852       DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9853       if (Lanes[0].getValueSizeInBits() == 32) {
9854         for (unsigned I = 0; I < 3; ++I)
9855           Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9856       } else {
9857         if (IsAligned) {
9858           Ops.push_back(DAG.getBitcast(
9859               MVT::i32,
9860               DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
9861           Ops.push_back(Lanes[2]);
9862         } else {
9863           SDValue Elt0 = Ops.pop_back_val();
9864           Ops.push_back(DAG.getBitcast(
9865               MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
9866           Ops.push_back(DAG.getBitcast(
9867               MVT::i32,
9868               DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
9869         }
9870       }
9871     };
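    // E.g. packLanes(RayDir, true) on f16 data leaves the raw z-lane as the
    // last pushed element; the following unaligned call pops it and repacks
    // {dir.z, invdir.x} then {invdir.y, invdir.z}, so six halves occupy three
    // dwords without padding.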
9872 
9873     if (UseNSA && IsGFX11Plus) {
9874       Ops.push_back(NodePtr);
9875       Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9876       Ops.push_back(RayOrigin);
9877       if (IsA16) {
9878         SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9879         DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9880         DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9881         for (unsigned I = 0; I < 3; ++I) {
9882           MergedLanes.push_back(DAG.getBitcast(
9883               MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9884                                            {DirLanes[I], InvDirLanes[I]})));
9885         }
9886         Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9887       } else {
9888         Ops.push_back(RayDir);
9889         Ops.push_back(RayInvDir);
9890       }
9891     } else {
9892       if (Is64)
9893         DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9894                                   2);
9895       else
9896         Ops.push_back(NodePtr);
9897 
9898       Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9899       packLanes(RayOrigin, true);
9900       packLanes(RayDir, true);
9901       packLanes(RayInvDir, false);
9902     }
9903 
9904     if (!UseNSA) {
9905       // Build a single vector containing all the operands prepared so far.
9906       if (NumVAddrDwords > 12) {
9907         SDValue Poison = DAG.getPOISON(MVT::i32);
9908         Ops.append(16 - Ops.size(), Poison);
9909       }
9910       assert(Ops.size() >= 8 && Ops.size() <= 12);
9911       SDValue MergedOps =
9912           DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9913       Ops.clear();
9914       Ops.push_back(MergedOps);
9915     }
9916 
9917     Ops.push_back(TDescr);
9918     Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9919     Ops.push_back(M->getChain());
9920 
9921     auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9922     MachineMemOperand *MemRef = M->getMemOperand();
9923     DAG.setNodeMemRefs(NewNode, {MemRef});
9924     return SDValue(NewNode, 0);
9925   }
9926   case Intrinsic::amdgcn_global_atomic_fmin_num:
9927   case Intrinsic::amdgcn_global_atomic_fmax_num:
9928   case Intrinsic::amdgcn_flat_atomic_fmin_num:
9929   case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9930     MemSDNode *M = cast<MemSDNode>(Op);
9931     SDValue Ops[] = {
9932         M->getOperand(0), // Chain
9933         M->getOperand(2), // Ptr
9934         M->getOperand(3)  // Value
9935     };
9936     unsigned Opcode = 0;
9937     switch (IntrID) {
9938     case Intrinsic::amdgcn_global_atomic_fmin_num:
9939     case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9940       Opcode = ISD::ATOMIC_LOAD_FMIN;
9941       break;
9942     }
9943     case Intrinsic::amdgcn_global_atomic_fmax_num:
9944     case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9945       Opcode = ISD::ATOMIC_LOAD_FMAX;
9946       break;
9947     }
9948     default:
9949       llvm_unreachable("unhandled atomic opcode");
9950     }
9951     return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9952                          Ops, M->getMemOperand());
9953   }
9954   case Intrinsic::amdgcn_s_get_barrier_state:
9955   case Intrinsic::amdgcn_s_get_named_barrier_state: {
9956     SDValue Chain = Op->getOperand(0);
9957     SmallVector<SDValue, 2> Ops;
9958     unsigned Opc;
9959 
9960     if (isa<ConstantSDNode>(Op->getOperand(2))) {
9961       uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
9962       if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9963         BarID = (BarID >> 4) & 0x3F;
9964       Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9965       SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9966       Ops.push_back(K);
9967       Ops.push_back(Chain);
9968     } else {
9969       Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9970       if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9971         SDValue M0Val =
9972             DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
9973                         DAG.getShiftAmountConstant(4, MVT::i32, DL));
9974         M0Val = SDValue(
9975             DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
9976                                DAG.getTargetConstant(0x3F, DL, MVT::i32)),
9977             0);
9978         Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9979       } else
9980         Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
9981     }
9982 
9983     auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9984     return SDValue(NewMI, 0);
9985   }
9986   default:
9988     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9989             AMDGPU::getImageDimIntrinsicInfo(IntrID))
9990       return lowerImage(Op, ImageDimIntr, DAG, true);
9991 
9992     return SDValue();
9993   }
9994 }
9995 
9996 // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9997 // dwordx4 on subtargets without dwordx3 load/stores, and handle TFE loads.
9998 SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9999                                               SDVTList VTList,
10000                                               ArrayRef<SDValue> Ops, EVT MemVT,
10001                                               MachineMemOperand *MMO,
10002                                               SelectionDAG &DAG) const {
10003   LLVMContext &C = *DAG.getContext();
10004   MachineFunction &MF = DAG.getMachineFunction();
10005   EVT VT = VTList.VTs[0];
10006 
10007   assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10008   bool IsTFE = VTList.NumVTs == 3;
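  // With TFE the hardware appends a status dword to the loaded data, so an
  // N-dword result is fetched as N+1 dwords. E.g. for VT = v2f32:
  // NumValueDWords = 2, NumOpDWords = 3; dwords 0-1 hold the value, dword 2
  // the status, and the code below splits them back apart.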
10009   if (IsTFE) {
10010     unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10011     unsigned NumOpDWords = NumValueDWords + 1;
10012     EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10013     SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10014     MachineMemOperand *OpDWordsMMO =
10015         MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10016     SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10017                                      OpDWordsVT, OpDWordsMMO, DAG);
10018     SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10019                                  DAG.getVectorIdxConstant(NumValueDWords, DL));
10020     SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10021     SDValue ValueDWords =
10022         NumValueDWords == 1
10023             ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10024             : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10025                           EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10026                           ZeroIdx);
10027     SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10028     return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10029   }
10030 
10031   if (!Subtarget->hasDwordx3LoadStores() &&
10032       (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10033     EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10034     EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10035     MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10036     SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10037     SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10038                                          WidenedMemVT, WidenedMMO);
10039     SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10040                                 DAG.getVectorIdxConstant(0, DL));
10041     return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10042   }
10043 
10044   return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10045 }
10046 
10047 SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10048                                          bool ImageStore) const {
10049   EVT StoreVT = VData.getValueType();
10050 
10051   // No change for f16 and legal vector D16 types.
10052   if (!StoreVT.isVector())
10053     return VData;
10054 
10055   SDLoc DL(VData);
10056   unsigned NumElements = StoreVT.getVectorNumElements();
10057 
10058   if (Subtarget->hasUnpackedD16VMem()) {
10059     // We need to unpack the packed data to store.
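    // E.g. a packed v4f16 store becomes: bitcast to v4i16, zero-extend to
    // v4i32, then unroll, so each half sits in the low 16 bits of its own
    // 32-bit element, as unpacked-D16 memory instructions expect.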
10060     EVT IntStoreVT = StoreVT.changeTypeToInteger();
10061     SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10062 
10063     EVT EquivStoreVT =
10064         EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
10065     SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
10066     return DAG.UnrollVectorOp(ZExt.getNode());
10067   }
10068 
10069   // The SQ block of gfx8.1 does not estimate register use correctly for d16
10070   // image store instructions. The data operand is computed as if it were not a
10071   // d16 image instruction.
10072   if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10073     // Bitcast to i16
10074     EVT IntStoreVT = StoreVT.changeTypeToInteger();
10075     SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10076 
10077     // Decompose into scalars
10078     SmallVector<SDValue, 4> Elts;
10079     DAG.ExtractVectorElements(IntVData, Elts);
10080 
10081     // Group pairs of i16 into v2i16 and bitcast to i32
10082     SmallVector<SDValue, 4> PackedElts;
10083     for (unsigned I = 0; I < Elts.size() / 2; ++I) {
10084       SDValue Pair =
10085           DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
10086       SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10087       PackedElts.push_back(IntPair);
10088     }
10089     if ((NumElements % 2) == 1) {
10090       // Handle v3i16
10091       unsigned I = Elts.size() / 2;
10092       SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
10093                                         {Elts[I * 2], DAG.getPOISON(MVT::i16)});
10094       SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10095       PackedElts.push_back(IntPair);
10096     }
10097 
10098     // Pad using poison values.
10099     PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
10100 
10101     // Build final vector
10102     EVT VecVT =
10103         EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
10104     return DAG.getBuildVector(VecVT, DL, PackedElts);
10105   }
10106 
10107   if (NumElements == 3) {
10108     EVT IntStoreVT =
10109         EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
10110     SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10111 
10112     EVT WidenedStoreVT = EVT::getVectorVT(
10113         *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
10114     EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
10115                                          WidenedStoreVT.getStoreSizeInBits());
10116     SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
10117     return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
10118   }
10119 
10120   assert(isTypeLegal(StoreVT));
10121   return VData;
10122 }
10123 
10124 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
10125                                               SelectionDAG &DAG) const {
10126   SDLoc DL(Op);
10127   SDValue Chain = Op.getOperand(0);
10128   unsigned IntrinsicID = Op.getConstantOperandVal(1);
10129   MachineFunction &MF = DAG.getMachineFunction();
10130 
10131   switch (IntrinsicID) {
10132   case Intrinsic::amdgcn_exp_compr: {
10133     if (!Subtarget->hasCompressedExport()) {
10134       DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10135           DAG.getMachineFunction().getFunction(),
10136           "intrinsic not supported on subtarget", DL.getDebugLoc()));
10137     }
10138     SDValue Src0 = Op.getOperand(4);
10139     SDValue Src1 = Op.getOperand(5);
10140     // Hack around illegal type on SI by directly selecting it.
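    // Illustrative IR that takes this path when v2f16/v2i16 is illegal
    // (a sketch, not from this file):
    //   call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15,
    //       <2 x i16> %a, <2 x i16> %b, i1 false, i1 false)
    // The operands are bitcast to f32 and an EXP/EXP_DONE node is created
    // directly with compr=1.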
10141     if (isTypeLegal(Src0.getValueType()))
10142       return SDValue();
10143 
10144     const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
10145     SDValue Poison = DAG.getPOISON(MVT::f32);
10146     const SDValue Ops[] = {
10147         Op.getOperand(2),                              // tgt
10148         DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
10149         DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
10150         Poison,                                        // src2
10151         Poison,                                        // src3
10152         Op.getOperand(7),                              // vm
10153         DAG.getTargetConstant(1, DL, MVT::i1),         // compr
10154         Op.getOperand(3),                              // en
10155         Op.getOperand(0)                               // Chain
10156     };
10157 
10158     unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10159     return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
10160   }
10161   case Intrinsic::amdgcn_s_barrier:
10162   case Intrinsic::amdgcn_s_barrier_signal:
10163   case Intrinsic::amdgcn_s_barrier_wait: {
10164     const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
10165     if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
10166       unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
10167       if (WGSize <= ST.getWavefrontSize()) {
10168         // If the workgroup fits in a wave, remove s_barrier_signal and lower
10169         // s_barrier/s_barrier_wait to wave_barrier.
10170         if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
10171           return Op.getOperand(0);
10172 
10173         return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
10174                                           MVT::Other, Op.getOperand(0)),
10175                        0);
10176       }
10177     }
10178 
10179     if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
10180       // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
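      // chained together, illustratively (a sketch):
      //   S_BARRIER_SIGNAL_IMM <workgroup barrier id>
      //   S_BARRIER_WAIT <workgroup barrier id>
      // with the wait consuming the signal's output chain.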
10181       SDValue K =
10182           DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
10183       SDValue BarSignal =
10184           SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
10185                                      MVT::Other, K, Op.getOperand(0)),
10186                   0);
10187       SDValue BarWait =
10188           SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
10189                                      BarSignal.getValue(0)),
10190                   0);
10191       return BarWait;
10192     }
10193 
10194     return SDValue();
10195   }
10196 
10197   case Intrinsic::amdgcn_struct_tbuffer_store:
10198   case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10199     SDValue VData = Op.getOperand(2);
10200     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10201     if (IsD16)
10202       VData = handleD16VData(VData, DAG);
10203     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10204     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10205     auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10206     SDValue Ops[] = {
10207         Chain,
10208         VData,                                 // vdata
10209         Rsrc,                                  // rsrc
10210         Op.getOperand(4),                      // vindex
10211         VOffset,                               // voffset
10212         SOffset,                               // soffset
10213         Offset,                                // offset
10214         Op.getOperand(7),                      // format
10215         Op.getOperand(8),                      // cachepolicy, swizzled buffer
10216         DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10217     };
10218     unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10219                          : AMDGPUISD::TBUFFER_STORE_FORMAT;
10220     MemSDNode *M = cast<MemSDNode>(Op);
10221     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10222                                    M->getMemoryVT(), M->getMemOperand());
10223   }
10224 
10225   case Intrinsic::amdgcn_raw_tbuffer_store:
10226   case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
10227     SDValue VData = Op.getOperand(2);
10228     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10229     if (IsD16)
10230       VData = handleD16VData(VData, DAG);
10231     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10232     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10233     auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10234     SDValue Ops[] = {
10235         Chain,
10236         VData,                                 // vdata
10237         Rsrc,                                  // rsrc
10238         DAG.getConstant(0, DL, MVT::i32),      // vindex
10239         VOffset,                               // voffset
10240         SOffset,                               // soffset
10241         Offset,                                // offset
10242         Op.getOperand(6),                      // format
10243         Op.getOperand(7),                      // cachepolicy, swizzled buffer
10244         DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10245     };
10246     unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10247                          : AMDGPUISD::TBUFFER_STORE_FORMAT;
10248     MemSDNode *M = cast<MemSDNode>(Op);
10249     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10250                                    M->getMemoryVT(), M->getMemOperand());
10251   }
10252 
10253   case Intrinsic::amdgcn_raw_buffer_store:
10254   case Intrinsic::amdgcn_raw_ptr_buffer_store:
10255   case Intrinsic::amdgcn_raw_buffer_store_format:
10256   case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
10257     const bool IsFormat =
10258         IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
10259         IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
10260 
10261     SDValue VData = Op.getOperand(2);
10262     EVT VDataVT = VData.getValueType();
10263     EVT EltType = VDataVT.getScalarType();
10264     bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
10265     if (IsD16) {
10266       VData = handleD16VData(VData, DAG);
10267       VDataVT = VData.getValueType();
10268     }
10269 
10270     if (!isTypeLegal(VDataVT)) {
10271       VData =
10272           DAG.getNode(ISD::BITCAST, DL,
10273                       getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
10274     }
10275 
10276     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10277     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10278     auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10279     SDValue Ops[] = {
10280         Chain,
10281         VData,
10282         Rsrc,
10283         DAG.getConstant(0, DL, MVT::i32),      // vindex
10284         VOffset,                               // voffset
10285         SOffset,                               // soffset
10286         Offset,                                // offset
10287         Op.getOperand(6),                      // cachepolicy, swizzled buffer
10288         DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10289     };
10290     unsigned Opc =
10291         IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
10292     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
10293     MemSDNode *M = cast<MemSDNode>(Op);
10294 
10295     // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
10296     if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
10297       return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
10298 
10299     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10300                                    M->getMemoryVT(), M->getMemOperand());
10301   }
10302 
10303   case Intrinsic::amdgcn_struct_buffer_store:
10304   case Intrinsic::amdgcn_struct_ptr_buffer_store:
10305   case Intrinsic::amdgcn_struct_buffer_store_format:
10306   case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
10307     const bool IsFormat =
10308         IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
10309         IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
10310 
10311     SDValue VData = Op.getOperand(2);
10312     EVT VDataVT = VData.getValueType();
10313     EVT EltType = VDataVT.getScalarType();
10314     bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
10315 
10316     if (IsD16) {
10317       VData = handleD16VData(VData, DAG);
10318       VDataVT = VData.getValueType();
10319     }
10320 
10321     if (!isTypeLegal(VDataVT)) {
10322       VData =
10323           DAG.getNode(ISD::BITCAST, DL,
10324                       getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
10325     }
10326 
10327     auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10328     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10329     auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10330     SDValue Ops[] = {
10331         Chain,
10332         VData,
10333         Rsrc,
10334         Op.getOperand(4),                      // vindex
10335         VOffset,                               // voffset
10336         SOffset,                               // soffset
10337         Offset,                                // offset
10338         Op.getOperand(7),                      // cachepolicy, swizzled buffer
10339         DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10340     };
10341     unsigned Opc =
10342         !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
10343     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
10344     MemSDNode *M = cast<MemSDNode>(Op);
10345 
10346     // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
10347     EVT VDataType = VData.getValueType().getScalarType();
10348     if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
10349       return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
10350 
10351     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10352                                    M->getMemoryVT(), M->getMemOperand());
10353   }
10354   case Intrinsic::amdgcn_raw_buffer_load_lds:
10355   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
10356   case Intrinsic::amdgcn_struct_buffer_load_lds:
10357   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
10358     if (!Subtarget->hasVMemToLDSLoad())
10359       return SDValue();
10360     unsigned Opc;
10361     bool HasVIndex =
10362         IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
10363         IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
10364     unsigned OpOffset = HasVIndex ? 1 : 0;
10365     SDValue VOffset = Op.getOperand(5 + OpOffset);
10366     bool HasVOffset = !isNullConstant(VOffset);
10367     unsigned Size = Op->getConstantOperandVal(4);
10368 
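          // Select the addressing variant: BOTHEN when both vindex and voffset
          // are present, IDXEN for vindex only, OFFEN for voffset only, and
          // OFFSET when neither is needed.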
10369     switch (Size) {
10370     default:
10371       return SDValue();
10372     case 1:
10373       Opc = HasVIndex    ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
10374                                       : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
10375             : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
10376                          : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
10377       break;
10378     case 2:
10379       Opc = HasVIndex    ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
10380                                       : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
10381             : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
10382                          : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
10383       break;
10384     case 4:
10385       Opc = HasVIndex    ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
10386                                       : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
10387             : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
10388                          : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
10389       break;
10390     case 12:
10391       if (!Subtarget->hasLDSLoadB96_B128())
10392         return SDValue();
10393       Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
10394                                    : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
10395                       : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
10396                                    : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
10397       break;
10398     case 16:
10399       if (!Subtarget->hasLDSLoadB96_B128())
10400         return SDValue();
10401       Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
10402                                    : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
10403                       : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
10404                                    : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
10405       break;
10406     }
10407 
10408     SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10409 
10410     SmallVector<SDValue, 8> Ops;
10411 
10412     if (HasVIndex && HasVOffset)
10413       Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
10414                                        {Op.getOperand(5), // VIndex
10415                                         VOffset}));
10416     else if (HasVIndex)
10417       Ops.push_back(Op.getOperand(5));
10418     else if (HasVOffset)
10419       Ops.push_back(VOffset);
10420 
10421     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10422     Ops.push_back(Rsrc);
10423     Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
10424     Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
10425     bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10426     unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
10427     Ops.push_back(DAG.getTargetConstant(
10428         Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
10429         DL, MVT::i8)); // cpol
10430     Ops.push_back(DAG.getTargetConstant(
10431         Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
10432             ? 1
10433             : 0,
10434         DL, MVT::i8));                                           // swz
10435     Ops.push_back(M0Val.getValue(0));                            // Chain
10436     Ops.push_back(M0Val.getValue(1));                            // Glue
10437 
10438     auto *M = cast<MemSDNode>(Op);
10439     MachineMemOperand *LoadMMO = M->getMemOperand();
10440     // Don't set the offset value here because the pointer points to the base of
10441     // the buffer.
10442     MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10443 
10444     MachinePointerInfo StorePtrI = LoadPtrI;
10445     LoadPtrI.V = PoisonValue::get(
10446         PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
10447     LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10448     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10449 
10450     auto F = LoadMMO->getFlags() &
10451              ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10452     LoadMMO =
10453         MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
10454                                 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10455 
10456     MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10457         StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
10458         LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10459 
10460     auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
10461     DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10462 
10463     return SDValue(Load, 0);
10464   }
10465   // Buffers are handled by LowerBufferFatPointers, and we're going to go
10466   // for "trust me" that the remaining cases are global pointers until
10467   // such time as we can put two mem operands on an intrinsic.
10468   case Intrinsic::amdgcn_load_to_lds:
10469   case Intrinsic::amdgcn_global_load_lds: {
10470     if (!Subtarget->hasVMemToLDSLoad())
10471       return SDValue();
10472 
10473     unsigned Opc;
10474     unsigned Size = Op->getConstantOperandVal(4);
10475     switch (Size) {
10476     default:
10477       return SDValue();
10478     case 1:
10479       Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
10480       break;
10481     case 2:
10482       Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
10483       break;
10484     case 4:
10485       Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10486       break;
10487     case 12:
10488       if (!Subtarget->hasLDSLoadB96_B128())
10489         return SDValue();
10490       Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10491       break;
10492     case 16:
10493       if (!Subtarget->hasLDSLoadB96_B128())
10494         return SDValue();
10495       Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10496       break;
10497     }
10498 
10499     SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10500 
10501     SmallVector<SDValue, 6> Ops;
10502 
10503     SDValue Addr = Op.getOperand(2); // Global ptr
10504     SDValue VOffset;
10505     // Try to split SAddr and VOffset. Global and LDS pointers share the same
10506     // immediate offset, so we cannot use a regular SelectGlobalSAddr().
10507     if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
10508       SDValue LHS = Addr.getOperand(0);
10509       SDValue RHS = Addr.getOperand(1);
10510 
10511       if (LHS->isDivergent())
10512         std::swap(LHS, RHS);
10513 
10514       if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
10515           RHS.getOperand(0).getValueType() == MVT::i32) {
10516         // add (i64 sgpr), (zero_extend (i32 vgpr))
10517         Addr = LHS;
10518         VOffset = RHS.getOperand(0);
10519       }
10520     }
10521 
10522     Ops.push_back(Addr);
10523     if (!Addr->isDivergent()) {
10524       Opc = AMDGPU::getGlobalSaddrOp(Opc);
10525       if (!VOffset)
10526         VOffset =
10527             SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
10528                                        DAG.getTargetConstant(0, DL, MVT::i32)),
10529                     0);
10530       Ops.push_back(VOffset);
10531     }
10532 
10533     Ops.push_back(Op.getOperand(5));  // Offset
10534     Ops.push_back(Op.getOperand(6));  // CPol
10535     Ops.push_back(M0Val.getValue(0)); // Chain
10536     Ops.push_back(M0Val.getValue(1)); // Glue
10537 
10538     auto *M = cast<MemSDNode>(Op);
10539     MachineMemOperand *LoadMMO = M->getMemOperand();
10540     MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10541     LoadPtrI.Offset = Op->getConstantOperandVal(5);
10542     MachinePointerInfo StorePtrI = LoadPtrI;
10543     LoadPtrI.V = PoisonValue::get(
10544         PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
10545     LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10546     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10547     auto F = LoadMMO->getFlags() &
10548              ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10549     LoadMMO =
10550         MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
10551                                 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10552     MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10553         StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
10554         LoadMMO->getAAInfo());
10555 
10556     auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10557     DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10558 
10559     return SDValue(Load, 0);
10560   }
10561   case Intrinsic::amdgcn_end_cf:
10562     return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
10563                                       Op->getOperand(2), Chain),
10564                    0);
10565   case Intrinsic::amdgcn_s_barrier_signal_var: {
10566     // This intrinsic has two operands: the barrier pointer and member count.
10567     SDValue Chain = Op->getOperand(0);
10568     SmallVector<SDValue, 2> Ops;
10569     SDValue BarOp = Op->getOperand(2);
10570     SDValue CntOp = Op->getOperand(3);
10571     SDValue M0Val;
10572     // extract the BarrierID from bits 4-9 of BarOp
10573     // Extract the BarrierID from bits 4-9 of BarOp.
10574     BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10575                         DAG.getShiftAmountConstant(4, MVT::i32, DL));
10576     BarID =
10577         SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
10578                                    DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10579                 0);
10580     // Member count should be put into M0[ShAmt:+6]
10581     // Barrier ID should be put into M0[5:0]
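          // e.g. (illustrative): member count 12 and barrier ID 5 pack as
          //   M0 = (12 << 16) | 5 = 0x000C0005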
10582     M0Val =
10583         SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
10584                                    DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10585                 0);
10586     constexpr unsigned ShAmt = 16;
10587     M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, M0Val,
10588                         DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
10589 
10590     M0Val = SDValue(
10591         DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
10592 
10593     Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10594 
10595     auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_M0, DL,
10596                                      Op->getVTList(), Ops);
10597     return SDValue(NewMI, 0);
10598   }
10599   case Intrinsic::amdgcn_s_prefetch_data: {
10600     // For address spaces other than flat or global, preserve the chain and
          // drop the call.
10601     if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
10602       return Op.getOperand(0);
10603     return Op;
10604   }
10605   case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10606     SDValue Ops[] = {
10607         Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
10608         Op.getOperand(3), // offset
10609         Op.getOperand(4), // length
10610     };
10611 
10612     MemSDNode *M = cast<MemSDNode>(Op);
10613     return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
10614                                    Op->getVTList(), Ops, M->getMemoryVT(),
10615                                    M->getMemOperand());
10616   }
10617   default: {
10618     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10619             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10620       return lowerImage(Op, ImageDimIntr, DAG, true);
10621 
10622     return Op;
10623   }
10624   }
10625 }
10626 
10627 bool SITargetLowering::shouldPreservePtrArith(const Function &F,
10628                                               EVT PtrVT) const {
10629   return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
10630 }
10631 
10632 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
10633 // offset (the offset that is included in bounds checking and swizzling, to be
10634 // split between the instruction's voffset and immoffset fields) and soffset
10635 // (the offset that is excluded from bounds checking and swizzling, to go in
10636 // the instruction's soffset field).  This function takes the first kind of
10637 // offset and figures out how to split it between voffset and immoffset.
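      //
      // For example (illustrative, assuming a maximum immediate of 4095): a
      // combined offset of 4100 is split into voffset = 4096 and immoffset = 4.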
10638 std::pair<SDValue, SDValue>
10639 SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
10640   SDLoc DL(Offset);
10641   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
10642   SDValue N0 = Offset;
10643   ConstantSDNode *C1 = nullptr;
10644 
10645   if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10646     N0 = SDValue();
10647   else if (DAG.isBaseWithConstantOffset(N0)) {
10648     C1 = cast<ConstantSDNode>(N0.getOperand(1));
10649     N0 = N0.getOperand(0);
10650   }
10651 
10652   if (C1) {
10653     unsigned ImmOffset = C1->getZExtValue();
10654     // If the immediate value is too big for the immoffset field, put only bits
10655     // that would normally fit in the immoffset field. The remaining value that
10656     // is copied/added for the voffset field is a large power of 2, and it
10657     // stands more chance of being CSEd with the copy/add for another similar
10658     // load/store.
10659     // However, do not round down this way if the remaining (voffset) part
10660     // would be negative as an i32: a negative offset in the vgpr appears to
10661     // be illegal, even if adding the immediate offset makes it positive.
10662     unsigned Overflow = ImmOffset & ~MaxImm;
10663     ImmOffset -= Overflow;
10664     if ((int32_t)Overflow < 0) {
10665       Overflow += ImmOffset;
10666       ImmOffset = 0;
10667     }
10668     C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
10669     if (Overflow) {
10670       auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10671       if (!N0)
10672         N0 = OverflowVal;
10673       else {
10674         SDValue Ops[] = {N0, OverflowVal};
10675         N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
10676       }
10677     }
10678   }
10679   if (!N0)
10680     N0 = DAG.getConstant(0, DL, MVT::i32);
10681   if (!C1)
10682     C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10683   return {N0, SDValue(C1, 0)};
10684 }
10685 
10686 // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10687 // the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10688 // pointed to by Offsets.
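      //
      // For example (illustrative): a combined offset of (%base + 60) becomes
      // voffset = %base, soffset = 0 and instoffset = 60, provided
      // splitMUBUFOffset accepts the constant for this subtarget.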
10689 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10690                                         SelectionDAG &DAG, SDValue *Offsets,
10691                                         Align Alignment) const {
10692   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10693   SDLoc DL(CombinedOffset);
10694   if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10695     uint32_t Imm = C->getZExtValue();
10696     uint32_t SOffset, ImmOffset;
10697     if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10698       Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10699       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10700       Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10701       return;
10702     }
10703   }
10704   if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10705     SDValue N0 = CombinedOffset.getOperand(0);
10706     SDValue N1 = CombinedOffset.getOperand(1);
10707     uint32_t SOffset, ImmOffset;
10708     int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10709     if (Offset >= 0 &&
10710         TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10711       Offsets[0] = N0;
10712       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10713       Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10714       return;
10715     }
10716   }
10717 
10718   SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10719                             ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10720                             : DAG.getConstant(0, DL, MVT::i32);
10721 
10722   Offsets[0] = CombinedOffset;
10723   Offsets[1] = SOffsetZero;
10724   Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10725 }
10726 
10727 SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10728                                                 SelectionDAG &DAG) const {
10729   if (!MaybePointer.getValueType().isScalarInteger())
10730     return MaybePointer;
10731 
10732   SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10733   return Rsrc;
10734 }
10735 
10736 // Wrap a global or flat pointer into a buffer intrinsic using the flags
10737 // specified in the intrinsic.
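      //
      // The descriptor words built below are, illustratively:
      //   word0 = pointer[31:0]
      //   word1 = pointer[47:32] | (stride << 16)
      //   word2 = NumRecords
      //   word3 = Flags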
10738 SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10739                                                    SelectionDAG &DAG) const {
10740   SDLoc Loc(Op);
10741 
10742   SDValue Pointer = Op->getOperand(1);
10743   SDValue Stride = Op->getOperand(2);
10744   SDValue NumRecords = Op->getOperand(3);
10745   SDValue Flags = Op->getOperand(4);
10746 
10747   auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10748   SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10749   SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10750   std::optional<uint32_t> ConstStride = std::nullopt;
10751   if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10752     ConstStride = ConstNode->getZExtValue();
10753 
10754   SDValue NewHighHalf = Masked;
10755   if (!ConstStride || *ConstStride != 0) {
10756     SDValue ShiftedStride;
10757     if (ConstStride) {
10758       ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10759     } else {
10760       SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10761       ShiftedStride =
10762           DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10763                       DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10764     }
10765     NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10766   }
10767 
10768   SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10769                              NewHighHalf, NumRecords, Flags);
10770   SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10771   return RsrcPtr;
10772 }
10773 
10774 // Handle 8-bit and 16-bit buffer loads.
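      // For example (illustrative): an i8 load is emitted as BUFFER_LOAD_UBYTE
      // returning i32, then truncated and bitcast to the requested type.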
10775 SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10776                                                      EVT LoadVT, SDLoc DL,
10777                                                      ArrayRef<SDValue> Ops,
10778                                                      MachineMemOperand *MMO,
10779                                                      bool IsTFE) const {
10780   EVT IntVT = LoadVT.changeTypeToInteger();
10781 
10782   if (IsTFE) {
10783     unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10784                        ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
10785                        : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
10786     MachineFunction &MF = DAG.getMachineFunction();
10787     MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
10788     SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
10789     SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10790     SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10791                                  DAG.getConstant(1, DL, MVT::i32));
10792     SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10793                                DAG.getConstant(0, DL, MVT::i32));
10794     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
10795     SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
10796     return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10797   }
10798 
10799   unsigned Opc = LoadVT.getScalarType() == MVT::i8
10800                      ? AMDGPUISD::BUFFER_LOAD_UBYTE
10801                      : AMDGPUISD::BUFFER_LOAD_USHORT;
10802 
10803   SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10804   SDValue BufferLoad =
10805       DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10806   SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10807   LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10808 
10809   return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10810 }
10811 
10812 // Handle 8-bit and 16-bit buffer stores.
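      // For example (illustrative): an f16 value is bitcast to i16, any-extended
      // to i32, and emitted as BUFFER_STORE_SHORT.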
10813 SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10814                                                       EVT VDataType, SDLoc DL,
10815                                                       SDValue Ops[],
10816                                                       MemSDNode *M) const {
10817   if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10818     Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10819 
10820   SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10821   Ops[1] = BufferStoreExt;
10822   unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
10823                                         : AMDGPUISD::BUFFER_STORE_SHORT;
10824   ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10825   return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10826                                  M->getMemOperand());
10827 }
10828 
10829 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
10830                                  SDValue Op, const SDLoc &SL, EVT VT) {
10831   if (VT.bitsLT(Op.getValueType()))
10832     return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10833 
10834   switch (ExtType) {
10835   case ISD::SEXTLOAD:
10836     return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10837   case ISD::ZEXTLOAD:
10838     return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10839   case ISD::EXTLOAD:
10840     return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10841   case ISD::NON_EXTLOAD:
10842     return Op;
10843   }
10844 
10845   llvm_unreachable("invalid ext type");
10846 }
10847 
10848 // Try to turn 8-bit and 16-bit scalar loads into SMEM-eligible 32-bit loads.
10849 // TODO: Skip this on GFX12, which does have scalar sub-dword loads.
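      // For example (illustrative): a uniform zextload of i8 from the constant
      // address space becomes an aligned i32 scalar load followed by a mask of
      // the low 8 bits.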
10850 SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
10851                                     DAGCombinerInfo &DCI) const {
10852   SelectionDAG &DAG = DCI.DAG;
10853   if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10854     return SDValue();
10855 
10856   // FIXME: Constant loads should all be marked invariant.
10857   unsigned AS = Ld->getAddressSpace();
10858   if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10859       AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10860       (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10861     return SDValue();
10862 
10863   // Don't do this early, since it may interfere with adjacent load merging for
10864   // illegal types. We can avoid losing alignment information for exotic types
10865   // pre-legalize.
10866   EVT MemVT = Ld->getMemoryVT();
10867   if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10868       MemVT.getSizeInBits() >= 32)
10869     return SDValue();
10870 
10871   SDLoc SL(Ld);
10872 
10873   assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10874          "unexpected vector extload");
10875 
10876   // TODO: Drop only high part of range.
10877   SDValue Ptr = Ld->getBasePtr();
10878   SDValue NewLoad = DAG.getLoad(
10879       ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10880       Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10881       Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10882       nullptr); // Drop ranges
10883 
10884   EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10885   if (MemVT.isFloatingPoint()) {
10886     assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10887            "unexpected fp extload");
10888     TruncVT = MemVT.changeTypeToInteger();
10889   }
10890 
10891   SDValue Cvt = NewLoad;
10892   if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10893     Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10894                       DAG.getValueType(TruncVT));
10895   } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10896              Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10897     Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10898   } else {
10899     assert(Ld->getExtensionType() == ISD::EXTLOAD);
10900   }
10901 
10902   EVT VT = Ld->getValueType(0);
10903   EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10904 
10905   DCI.AddToWorklist(Cvt.getNode());
10906 
10907   // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10908   // the appropriate extension from the 32-bit load.
10909   Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10910   DCI.AddToWorklist(Cvt.getNode());
10911 
10912   // Handle conversion back to floating point if necessary.
10913   Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10914 
10915   return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
10916 }
10917 
10918 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10919                                           const SIMachineFunctionInfo &Info) {
10920   // TODO: Should check if the address can definitely not access stack.
10921   if (Info.isEntryFunction())
10922     return Info.getUserSGPRInfo().hasFlatScratchInit();
10923   return true;
10924 }
10925 
10926 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10927   SDLoc DL(Op);
10928   LoadSDNode *Load = cast<LoadSDNode>(Op);
10929   ISD::LoadExtType ExtType = Load->getExtensionType();
10930   EVT MemVT = Load->getMemoryVT();
10931   MachineMemOperand *MMO = Load->getMemOperand();
10932 
10933   if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10934     if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10935       return SDValue();
10936 
10937     // FIXME: Copied from PPC
10938     // First, load into 32 bits, then truncate to 1 bit.
10939 
10940     SDValue Chain = Load->getChain();
10941     SDValue BasePtr = Load->getBasePtr();
10942 
10943     EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10944 
10945     SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
10946                                    RealMemVT, MMO);
10947 
10948     if (!MemVT.isVector()) {
10949       SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10950                        NewLD.getValue(1)};
10951 
10952       return DAG.getMergeValues(Ops, DL);
10953     }
10954 
10955     SmallVector<SDValue, 3> Elts;
10956     for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10957       SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10958                                 DAG.getConstant(I, DL, MVT::i32));
10959 
10960       Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10961     }
10962 
10963     SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
10964 
10965     return DAG.getMergeValues(Ops, DL);
10966   }
10967 
10968   if (!MemVT.isVector())
10969     return SDValue();
10970 
10971   assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10972          "Custom lowering for non-i32 vectors hasn't been implemented.");
10973 
10974   Align Alignment = Load->getAlign();
10975   unsigned AS = Load->getAddressSpace();
10976   if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10977       Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10978     return SplitVectorLoad(Op, DAG);
10979   }
10980 
10981   MachineFunction &MF = DAG.getMachineFunction();
10982   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10983   // If there is a possibility that flat instruction access scratch memory
10984   // then we need to use the same legalization rules we use for private.
10985   if (AS == AMDGPUAS::FLAT_ADDRESS &&
10986       !Subtarget->hasMultiDwordFlatScratchAddressing())
10987     AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
10988              ? AMDGPUAS::PRIVATE_ADDRESS
10989              : AMDGPUAS::GLOBAL_ADDRESS;
10990 
10991   unsigned NumElements = MemVT.getVectorNumElements();
10992 
10993   if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10994       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10995       (AS == AMDGPUAS::GLOBAL_ADDRESS &&
10996        Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
10997        isMemOpHasNoClobberedMemOperand(Load))) {
10998     if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
10999         Alignment >= Align(4) && NumElements < 32) {
11000       if (MemVT.isPow2VectorType() ||
11001           (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11002         return SDValue();
11003       return WidenOrSplitVectorLoad(Op, DAG);
11004     }
11005     // Non-uniform loads will be selected to MUBUF instructions, so they
11006     // have the same legalization requirements as global and private
11007     // loads.
11008     //
11009   }
11010   if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11011       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11012       AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11013     if (NumElements > 4)
11014       return SplitVectorLoad(Op, DAG);
11015     // v3 loads not supported on SI.
11016     if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11017       return WidenOrSplitVectorLoad(Op, DAG);
11018 
11019     // v3 and v4 loads are supported for private and global memory.
11020     return SDValue();
11021   }
11022   if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11023     // Depending on the setting of the private_element_size field in the
11024     // resource descriptor, we can only make private accesses up to a certain
11025     // size.
11026     switch (Subtarget->getMaxPrivateElementSize()) {
11027     case 4: {
11028       auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
11029       return DAG.getMergeValues({Op0, Op1}, DL);
11030     }
11031     case 8:
11032       if (NumElements > 2)
11033         return SplitVectorLoad(Op, DAG);
11034       return SDValue();
11035     case 16:
11036       // Same as global/flat
11037       if (NumElements > 4)
11038         return SplitVectorLoad(Op, DAG);
11039       // v3 loads not supported on SI.
11040       if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11041         return WidenOrSplitVectorLoad(Op, DAG);
11042 
11043       return SDValue();
11044     default:
11045       llvm_unreachable("unsupported private_element_size");
11046     }
11047   } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11048     unsigned Fast = 0;
11049     auto Flags = Load->getMemOperand()->getFlags();
11050     if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
11051                                            Load->getAlign(), Flags, &Fast) &&
11052         Fast > 1)
11053       return SDValue();
11054 
11055     if (MemVT.isVector())
11056       return SplitVectorLoad(Op, DAG);
11057   }
11058 
11059   if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11060                                       MemVT, *Load->getMemOperand())) {
11061     auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
11062     return DAG.getMergeValues({Op0, Op1}, DL);
11063   }
11064 
11065   return SDValue();
11066 }
11067 
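      // Lower a 64-bit select by bitcasting the operands to v2i32 and selecting
      // the low and high halves separately; 128/256/512-bit vector types are
      // split instead.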
11068 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
11069   EVT VT = Op.getValueType();
11070   if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
11071       VT.getSizeInBits() == 512)
11072     return splitTernaryVectorOp(Op, DAG);
11073 
11074   assert(VT.getSizeInBits() == 64);
11075 
11076   SDLoc DL(Op);
11077   SDValue Cond = Op.getOperand(0);
11078 
11079   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
11080   SDValue One = DAG.getConstant(1, DL, MVT::i32);
11081 
11082   SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11083   SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
11084 
11085   SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
11086   SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
11087 
11088   SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
11089 
11090   SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
11091   SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
11092 
11093   SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
11094 
11095   SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
11096   return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11097 }
11098 
11099 // Catch division cases where we can use shortcuts with rcp and rsq
11100 // instructions.
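      //
      // For example (illustrative, under afn): 1.0 / x -> rcp(x),
      // -1.0 / x -> rcp(fneg(x)), and otherwise x / y -> x * rcp(y).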
11101 SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
11102                                               SelectionDAG &DAG) const {
11103   SDLoc SL(Op);
11104   SDValue LHS = Op.getOperand(0);
11105   SDValue RHS = Op.getOperand(1);
11106   EVT VT = Op.getValueType();
11107   const SDNodeFlags Flags = Op->getFlags();
11108 
11109   bool AllowInaccurateRcp =
11110       Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
11111 
11112   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
11113     // Without !fpmath accuracy information, we can't do more because we don't
11114     // know exactly whether rcp is accurate enough to meet the !fpmath
11115     // requirement. f16 is always accurate enough.
11116     if (!AllowInaccurateRcp && VT != MVT::f16)
11117       return SDValue();
11118 
11119     if (CLHS->isExactlyValue(1.0)) {
11120       // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
11121       // the CI documentation have a worst-case error of 1 ulp.
11122       // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
11123       // use it as long as we aren't trying to use denormals.
11124       //
11125       // v_rcp_f16 and v_rsq_f16 DO support denormals and have 0.51 ulp error.
11126 
11127       // 1.0 / sqrt(x) -> rsq(x)
11128 
11129       // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
11130       // error seems really high at 2^29 ULP.
11131       // 1.0 / x -> rcp(x)
11132       return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11133     }
11134 
11135     // Same as for 1.0, but expand the sign out of the constant.
11136     if (CLHS->isExactlyValue(-1.0)) {
11137       // -1.0 / x -> rcp (fneg x)
11138       SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
11139       return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
11140     }
11141   }
11142 
11143   // For f16 require afn or arcp.
11144   // For f32 require afn.
11145   if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
11146     return SDValue();
11147 
11148   // Turn into multiply by the reciprocal.
11149   // x / y -> x * (1.0 / y)
11150   SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11151   return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
11152 }
11153 
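      // Fast f64 division (illustrative derivation): refine r ~= 1/y with two
      // Newton-Raphson steps, r' = fma(fma(-y, r, 1.0), r, r), then form
      // q = x * r and apply one residual correction, fma(fma(-y, q, x), r, q).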
11154 SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
11155                                                 SelectionDAG &DAG) const {
11156   SDLoc SL(Op);
11157   SDValue X = Op.getOperand(0);
11158   SDValue Y = Op.getOperand(1);
11159   EVT VT = Op.getValueType();
11160   const SDNodeFlags Flags = Op->getFlags();
11161 
11162   bool AllowInaccurateDiv =
11163       Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
11164   if (!AllowInaccurateDiv)
11165     return SDValue();
11166 
11167   SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
11168   SDValue One = DAG.getConstantFP(1.0, SL, VT);
11169 
11170   SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
11171   SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
11172 
11173   R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
11174   SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
11175   R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
11176   SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
11177   SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
11178   return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
11179 }
11180 
11181 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
11182                           EVT VT, SDValue A, SDValue B, SDValue GlueChain,
11183                           SDNodeFlags Flags) {
11184   if (GlueChain->getNumValues() <= 1) {
11185     return DAG.getNode(Opcode, SL, VT, A, B, Flags);
11186   }
11187 
11188   assert(GlueChain->getNumValues() == 3);
11189 
11190   SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
11191   switch (Opcode) {
11192   default:
11193     llvm_unreachable("no chain equivalent for opcode");
11194   case ISD::FMUL:
11195     Opcode = AMDGPUISD::FMUL_W_CHAIN;
11196     break;
11197   }
11198 
11199   return DAG.getNode(Opcode, SL, VTList,
11200                      {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
11201                      Flags);
11202 }
11203 
11204 static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
11205                            EVT VT, SDValue A, SDValue B, SDValue C,
11206                            SDValue GlueChain, SDNodeFlags Flags) {
11207   if (GlueChain->getNumValues() <= 1) {
11208     return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
11209   }
11210 
11211   assert(GlueChain->getNumValues() == 3);
11212 
11213   SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
11214   switch (Opcode) {
11215   default:
11216     llvm_unreachable("no chain equivalent for opcode");
11217   case ISD::FMA:
11218     Opcode = AMDGPUISD::FMA_W_CHAIN;
11219     break;
11220   }
11221 
11222   return DAG.getNode(Opcode, SL, VTList,
11223                      {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
11224                      Flags);
11225 }
11226 
11227 SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
11228   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11229     return FastLowered;
11230 
11231   SDLoc SL(Op);
11232   SDValue LHS = Op.getOperand(0);
11233   SDValue RHS = Op.getOperand(1);
11234 
11235   // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
11236   // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
11237   // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
11238   // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
11239   // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
11240   // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
11241   // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
11242   // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
11243   // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
11244   // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
11245   // q16.u = opx(V_CVT_F16_F32, q32.u);
11246   // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
11247 
11248   // We will use ISD::FMA on targets that don't support ISD::FMAD.
11249   unsigned FMADOpCode =
11250       isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
11251 
11252   SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
11253   SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
11254   SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
11255   SDValue Rcp =
11256       DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
11257   SDValue Quot =
11258       DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
11259   SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11260                             Op->getFlags());
11261   Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
11262   Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11263                     Op->getFlags());
11264   SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
11265   SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
11266   TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
11267                         DAG.getConstant(0xff800000, SL, MVT::i32));
11268   Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
11269   Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
11270   SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
11271                              DAG.getTargetConstant(0, SL, MVT::i32));
11272   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
11273                      Op->getFlags());
11274 }
11275 
11276 // Faster 2.5 ULP division that does not support denormals.
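      //
      // The denominator is pre-scaled to keep rcp in range; illustratively,
      // x / y == s * (x * rcp(y * s)), with s = 2^-32 when |y| > 2^96 and
      // s = 1.0 otherwise.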
11277 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
11278   SDNodeFlags Flags = Op->getFlags();
11279   SDLoc SL(Op);
11280   SDValue LHS = Op.getOperand(1);
11281   SDValue RHS = Op.getOperand(2);
11282 
11283   SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
11284 
11285   const APFloat K0Val(0x1p+96f);
11286   const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
11287 
11288   const APFloat K1Val(0x1p-32f);
11289   const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
11290 
11291   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
11292 
11293   EVT SetCCVT =
11294       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
11295 
11296   SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
11297 
11298   SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
11299 
11300   r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
11301 
11302   // rcp does not support denormals.
11303   SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
11304 
11305   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
11306 
11307   return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
11308 }
11309 
11310 // Returns immediate value for setting the F32 denorm mode when using the
11311 // S_DENORM_MODE instruction.
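      //
      // For example (illustrative): SPDenormMode = FP_DENORM_FLUSH_NONE (3) with
      // a double-precision default of 3 encodes as 3 | (3 << 2) = 0xF.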
11312 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
11313                                     const SIMachineFunctionInfo *Info,
11314                                     const GCNSubtarget *ST) {
11315   assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
11316   uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
11317   uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
11318   return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
11319 }
11320 
11321 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
11322   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11323     return FastLowered;
11324 
11325   // The selection matcher assumes anything with a chain selects to a
11326   // mayRaiseFPException machine instruction. Since we're introducing a chain
11327   // here, we need to explicitly report nofpexcept for the regular fdiv
11328   // lowering.
11329   SDNodeFlags Flags = Op->getFlags();
11330   Flags.setNoFPExcept(true);
11331 
11332   SDLoc SL(Op);
11333   SDValue LHS = Op.getOperand(0);
11334   SDValue RHS = Op.getOperand(1);
11335 
11336   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
11337 
11338   SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
11339 
11340   SDValue DenominatorScaled =
11341       DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
11342   SDValue NumeratorScaled =
11343       DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
11344 
11345   // Denominator is scaled to not be denormal, so using rcp is ok.
11346   SDValue ApproxRcp =
11347       DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
11348   SDValue NegDivScale0 =
11349       DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
11350 
11351   using namespace AMDGPU::Hwreg;
11352   const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
11353   const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
11354 
11355   const MachineFunction &MF = DAG.getMachineFunction();
11356   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
11357   const DenormalMode DenormMode = Info->getMode().FP32Denormals;
11358 
11359   const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
11360   const bool HasDynamicDenormals =
11361       (DenormMode.Input == DenormalMode::Dynamic) ||
11362       (DenormMode.Output == DenormalMode::Dynamic);
11363 
11364   SDValue SavedDenormMode;
11365 
11366   if (!PreservesDenormals) {
11367     // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
11368     // lowering. The chain dependence is insufficient, and we need glue. We do
11369     // not need the glue variants in a strictfp function.
11370 
11371     SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
11372 
11373     SDValue Glue = DAG.getEntryNode();
11374     if (HasDynamicDenormals) {
11375       SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
11376                                           DAG.getVTList(MVT::i32, MVT::Glue),
11377                                           {BitField, Glue});
11378       SavedDenormMode = SDValue(GetReg, 0);
11379 
11380       Glue = DAG.getMergeValues(
11381           {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
11382     }
11383 
11384     SDNode *EnableDenorm;
11385     if (Subtarget->hasDenormModeInst()) {
11386       const SDValue EnableDenormValue =
11387           getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
11388 
11389       EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
11390                                  EnableDenormValue)
11391                          .getNode();
11392     } else {
11393       const SDValue EnableDenormValue =
11394           DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
11395       EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
11396                                         {EnableDenormValue, BitField, Glue});
11397     }
11398 
11399     SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
11400                       SDValue(EnableDenorm, 1)};
11401 
11402     NegDivScale0 = DAG.getMergeValues(Ops, SL);
11403   }
11404 
11405   SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
11406                              ApproxRcp, One, NegDivScale0, Flags);
11407 
11408   SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
11409                              ApproxRcp, Fma0, Flags);
11410 
11411   SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
11412                            Fma1, Flags);
11413 
11414   SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
11415                              NumeratorScaled, Mul, Flags);
11416 
11417   SDValue Fma3 =
11418       getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
11419 
11420   SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
11421                              NumeratorScaled, Fma3, Flags);
11422 
11423   if (!PreservesDenormals) {
11424     SDNode *DisableDenorm;
11425     if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
11426       const SDValue DisableDenormValue = getSPDenormModeValue(
11427           FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
11428 
11429       SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
11430       DisableDenorm =
11431           DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
11432                       Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
11433               .getNode();
11434     } else {
11435       assert(HasDynamicDenormals == (bool)SavedDenormMode);
11436       const SDValue DisableDenormValue =
11437           HasDynamicDenormals
11438               ? SavedDenormMode
11439               : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
11440 
11441       DisableDenorm = DAG.getMachineNode(
11442           AMDGPU::S_SETREG_B32, SL, MVT::Other,
11443           {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
11444     }
11445 
11446     SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
11447                                       SDValue(DisableDenorm, 0), DAG.getRoot());
11448     DAG.setRoot(OutputChain);
11449   }
11450 
11451   SDValue Scale = NumeratorScaled.getValue(1);
11452   SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
11453                              {Fma4, Fma1, Fma3, Scale}, Flags);
11454 
11455   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
11456 }
11457 
11458 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
11459   if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
11460     return FastLowered;
11461 
11462   SDLoc SL(Op);
11463   SDValue X = Op.getOperand(0);
11464   SDValue Y = Op.getOperand(1);
11465 
11466   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
11467 
11468   SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
11469 
11470   SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
11471 
11472   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
11473 
11474   SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
11475 
11476   SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
11477 
11478   SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
11479 
11480   SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
11481 
11482   SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
11483 
11484   SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
11485   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
11486 
11487   SDValue Fma4 =
11488       DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
11489 
11490   SDValue Scale;
11491 
11492   if (!Subtarget->hasUsableDivScaleConditionOutput()) {
11493     // Work around a hardware bug on SI where the condition output from
11494     // div_scale is not usable.
11495 
11496     const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
11497 
11498     // Figure out which scale to use for div_fmas.
11499     SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
11500     SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
11501     SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
11502     SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
11503 
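          // div_scale returns its source operand unchanged unless it actually
          // applied a scale, so comparing the high dwords reveals whether the
          // numerator or the denominator was scaled.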
11504     SDValue NumHi =
11505         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
11506     SDValue DenHi =
11507         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
11508 
11509     SDValue Scale0Hi =
11510         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
11511     SDValue Scale1Hi =
11512         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
11513 
11514     SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
11515     SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
11516     Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
11517   } else {
11518     Scale = DivScale1.getValue(1);
11519   }
11520 
11521   SDValue Fmas =
11522       DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
11523 
11524   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
11525 }
11526 
11527 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
11528   EVT VT = Op.getValueType();
11529 
11530   if (VT == MVT::f32)
11531     return LowerFDIV32(Op, DAG);
11532 
11533   if (VT == MVT::f64)
11534     return LowerFDIV64(Op, DAG);
11535 
11536   if (VT == MVT::f16)
11537     return LowerFDIV16(Op, DAG);
11538 
11539   llvm_unreachable("Unexpected type for fdiv");
11540 }
11541 
11542 SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
11543   SDLoc dl(Op);
11544   SDValue Val = Op.getOperand(0);
11545   EVT VT = Val.getValueType();
11546   EVT ResultExpVT = Op->getValueType(1);
11547   EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11548 
11549   SDValue Mant = DAG.getNode(
11550       ISD::INTRINSIC_WO_CHAIN, dl, VT,
11551       DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
11552 
11553   SDValue Exp = DAG.getNode(
11554       ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
11555       DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
11556 
11557   if (Subtarget->hasFractBug()) {
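          // frexp does not give the expected mant/exp for infinities and NaNs
          // on these subtargets, so substitute exp = 0 and the original value
          // for the mantissa when the input is not finite.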
11558     SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
11559     SDValue Inf =
11560         DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
11561 
11562     SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
11563     SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
11564     Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
11565     Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
11566   }
11567 
11568   SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
11569   return DAG.getMergeValues({Mant, CastExp}, dl);
11570 }
11571 
11572 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
11573   SDLoc DL(Op);
11574   StoreSDNode *Store = cast<StoreSDNode>(Op);
11575   EVT VT = Store->getMemoryVT();
11576 
11577   if (VT == MVT::i1) {
11578     return DAG.getTruncStore(
11579         Store->getChain(), DL,
11580         DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
11581         Store->getBasePtr(), MVT::i1, Store->getMemOperand());
11582   }
11583 
11584   assert(VT.isVector() &&
11585          Store->getValue().getValueType().getScalarType() == MVT::i32);
11586 
11587   unsigned AS = Store->getAddressSpace();
11588   if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11589       Store->getAlign().value() < VT.getStoreSize() &&
11590       VT.getSizeInBits() > 32) {
11591     return SplitVectorStore(Op, DAG);
11592   }
11593 
11594   MachineFunction &MF = DAG.getMachineFunction();
11595   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11596   // If there is a possibility that flat instructions may access scratch memory,
11597   // then we need to use the same legalization rules we use for private.
11598   if (AS == AMDGPUAS::FLAT_ADDRESS &&
11599       !Subtarget->hasMultiDwordFlatScratchAddressing())
11600     AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
11601              ? AMDGPUAS::PRIVATE_ADDRESS
11602              : AMDGPUAS::GLOBAL_ADDRESS;
11603 
11604   unsigned NumElements = VT.getVectorNumElements();
11605   if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11606     if (NumElements > 4)
11607       return SplitVectorStore(Op, DAG);
11608     // v3 stores not supported on SI.
11609     if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11610       return SplitVectorStore(Op, DAG);
11611 
11612     if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11613                                         VT, *Store->getMemOperand()))
11614       return expandUnalignedStore(Store, DAG);
11615 
11616     return SDValue();
11617   }
11618   if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
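          // Scratch accesses are limited to the configured maximum private
          // element size (4, 8, or 16 bytes); wider vector stores must be
          // scalarized or split accordingly.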
11619     switch (Subtarget->getMaxPrivateElementSize()) {
11620     case 4:
11621       return scalarizeVectorStore(Store, DAG);
11622     case 8:
11623       if (NumElements > 2)
11624         return SplitVectorStore(Op, DAG);
11625       return SDValue();
11626     case 16:
11627       if (NumElements > 4 ||
11628           (NumElements == 3 && !Subtarget->enableFlatScratch()))
11629         return SplitVectorStore(Op, DAG);
11630       return SDValue();
11631     default:
11632       llvm_unreachable("unsupported private_element_size");
11633     }
11634   } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11635     unsigned Fast = 0;
11636     auto Flags = Store->getMemOperand()->getFlags();
11637     if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
11638                                            Store->getAlign(), Flags, &Fast) &&
11639         Fast > 1)
11640       return SDValue();
11641 
11642     if (VT.isVector())
11643       return SplitVectorStore(Op, DAG);
11644 
11645     return expandUnalignedStore(Store, DAG);
11646   }
11647 
11648   // Probably an invalid store. If so, we'll end up emitting a selection error.
11649   return SDValue();
11650 }
11651 
11652 // Avoid the full correct expansion for f32 sqrt when promoting from f16.
11653 SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
11654   SDLoc SL(Op);
11655   assert(!Subtarget->has16BitInsts());
11656   SDNodeFlags Flags = Op->getFlags();
11657   SDValue Ext =
11658       DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
11659 
11660   SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
11661   SDValue Sqrt =
11662       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
11663 
11664   return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
11665                      DAG.getTargetConstant(0, SL, MVT::i32), Flags);
11666 }
11667 
11668 SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
11669   SDLoc DL(Op);
11670   SDNodeFlags Flags = Op->getFlags();
11671   MVT VT = Op.getValueType().getSimpleVT();
11672   const SDValue X = Op.getOperand(0);
11673 
11674   if (allowApproxFunc(DAG, Flags)) {
11675     // The instruction is accurate to 1 ulp but ignores denormals.
11676     return DAG.getNode(
11677         ISD::INTRINSIC_WO_CHAIN, DL, VT,
11678         DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
11679   }
11680 
11681   SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
11682   SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
11683 
11684   SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
11685 
11686   SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
11687 
11688   SDValue SqrtX =
11689       DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11690 
11691   SDValue SqrtS;
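        // In the denormal-correct path below, the hardware sqrt estimate is
        // refined by computing FMA residuals against its two 1-ulp neighbors
        // and stepping one ulp down or up based on the residual signs.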
11692   if (needsDenormHandlingF32(DAG, X, Flags)) {
11693     SDValue SqrtID =
11694         DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11695     SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11696 
11697     SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11698     SDValue SqrtSNextDownInt =
11699         DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11700                     DAG.getAllOnesConstant(DL, MVT::i32));
11701     SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11702 
11703     SDValue NegSqrtSNextDown =
11704         DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11705 
11706     SDValue SqrtVP =
11707         DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11708 
11709     SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11710                                          DAG.getConstant(1, DL, MVT::i32));
11711     SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11712 
11713     SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
11714     SDValue SqrtVS =
11715         DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11716 
11717     SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11718     SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11719 
11720     SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11721                         Flags);
11722 
11723     SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11724     SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11725                         Flags);
11726   } else {
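          // Otherwise refine the rsq estimate with a Goldschmidt-style
          // iteration: s = x*r approximates sqrt(x) and h = r/2 its half
          // reciprocal; the residuals e and d below correct s before the
          // final fma.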
11727     SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11728 
11729     SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11730 
11731     SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11732     SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11733     SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11734 
11735     SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11736     SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11737     SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11738 
11739     SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11740     SDValue SqrtD =
11741         DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11742     SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11743   }
11744 
11745   SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11746 
11747   SDValue ScaledDown =
11748       DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11749 
11750   SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11751   SDValue IsZeroOrInf =
11752       DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11753                   DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11754 
11755   return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11756 }
11757 
11758 SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11759   // For the double type, the SQRT and RSQ instructions don't have the required
11760   // precision, so we apply Goldschmidt's algorithm to improve the result:
11761   //
11762   //   y0 = rsq(x)
11763   //   g0 = x * y0
11764   //   h0 = 0.5 * y0
11765   //
11766   //   r0 = 0.5 - h0 * g0
11767   //   g1 = g0 * r0 + g0
11768   //   h1 = h0 * r0 + h0
11769   //
11770   //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11771   //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
11772   //   h2 = h1 * r1 + h1
11773   //
11774   //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11775   //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
11776   //
11777   //   sqrt(x) = g3
11778 
11779   SDNodeFlags Flags = Op->getFlags();
11780 
11781   SDLoc DL(Op);
11782 
11783   SDValue X = Op.getOperand(0);
11784   SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11785 
11786   SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11787 
11788   SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11789 
11790   // Scale up input if it is too small.
11791   SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11792   SDValue ScaleUp =
11793       DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11794   SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
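        // Inputs below the threshold are scaled up by 2^256 to keep rsq in
        // range; since sqrt halves the exponent, the result is scaled back
        // down by 2^-128 at the end.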
11795 
11796   SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11797 
11798   SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11799 
11800   SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11801   SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11802 
11803   SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11804   SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11805 
11806   SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11807 
11808   SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11809 
11810   SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11811   SDValue SqrtD0 =
11812       DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11813 
11814   SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11815 
11816   SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11817   SDValue SqrtD1 =
11818       DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11819 
11820   SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11821 
11822   SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
11823   SDValue ScaleDown =
11824       DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11825   SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11826 
11827   // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11828   // with finite only or nsz because rsq(+/-0) = +/-inf
11829 
11830   // TODO: Check for DAZ and expand to subnormals
11831   SDValue IsZeroOrInf =
11832       DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11833                   DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11834 
11835   // If x is +INF, +0, or -0, use its original value
11836   return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11837                      Flags);
11838 }
11839 
11840 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11841   SDLoc DL(Op);
11842   EVT VT = Op.getValueType();
11843   SDValue Arg = Op.getOperand(0);
11844   SDValue TrigVal;
11845 
11846   // Propagate fast-math flags so that the multiply we introduce can be folded
11847   // if Arg is already the result of a multiply by constant.
11848   auto Flags = Op->getFlags();
11849 
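        // The hardware sin/cos take their input in revolutions rather than
        // radians, hence the multiply by 1/(2*pi). Subtargets with a reduced
        // input range additionally need a fract to wrap the value into [0, 1).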
11850   SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11851 
11852   if (Subtarget->hasTrigReducedRange()) {
11853     SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11854     TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11855   } else {
11856     TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11857   }
11858 
11859   switch (Op.getOpcode()) {
11860   case ISD::FCOS:
11861     return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11862   case ISD::FSIN:
11863     return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11864   default:
11865     llvm_unreachable("Wrong trig opcode");
11866   }
11867 }
11868 
11869 SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
11870                                                SelectionDAG &DAG) const {
11871   AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11872   assert(AtomicNode->isCompareAndSwap());
11873   unsigned AS = AtomicNode->getAddressSpace();
11874 
11875   // No custom lowering required for local address space
11876   if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11877     return Op;
11878 
11879   // Non-local address spaces require custom lowering for atomic compare
11880   // and swap; the cmp and swap values are packed into a v2i32, or a v2i64 for _X2.
11881   SDLoc DL(Op);
11882   SDValue ChainIn = Op.getOperand(0);
11883   SDValue Addr = Op.getOperand(1);
11884   SDValue Old = Op.getOperand(2);
11885   SDValue New = Op.getOperand(3);
11886   EVT VT = Op.getValueType();
11887   MVT SimpleVT = VT.getSimpleVT();
11888   MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11889 
11890   SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11891   SDValue Ops[] = {ChainIn, Addr, NewOld};
11892 
11893   return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
11894                                  Op->getVTList(), Ops, VT,
11895                                  AtomicNode->getMemOperand());
11896 }
11897 
11898 //===----------------------------------------------------------------------===//
11899 // Custom DAG optimizations
11900 //===----------------------------------------------------------------------===//
11901 
11902 SDValue
11903 SITargetLowering::performUCharToFloatCombine(SDNode *N,
11904                                              DAGCombinerInfo &DCI) const {
11905   EVT VT = N->getValueType(0);
11906   EVT ScalarVT = VT.getScalarType();
11907   if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11908     return SDValue();
11909 
11910   SelectionDAG &DAG = DCI.DAG;
11911   SDLoc DL(N);
11912 
11913   SDValue Src = N->getOperand(0);
11914   EVT SrcVT = Src.getValueType();
11915 
11916   // TODO: We could try to match extracting the higher bytes, which would be
11917   // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11918   // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11919   // about in practice.
11920   if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11921     if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11922       SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11923       DCI.AddToWorklist(Cvt.getNode());
11924 
11925       // For the f16 case, fold to a cast to f32 and then cast back to f16.
11926       if (ScalarVT != MVT::f32) {
11927         Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11928                           DAG.getTargetConstant(0, DL, MVT::i32));
11929       }
11930       return Cvt;
11931     }
11932   }
11933 
11934   return SDValue();
11935 }
11936 
11937 SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11938                                                   DAGCombinerInfo &DCI) const {
11939   SDValue MagnitudeOp = N->getOperand(0);
11940   SDValue SignOp = N->getOperand(1);
11941 
11942   // The generic combine for fcopysign + fp cast is too conservative with
11943   // vectors, and also gets confused by the splitting we will perform here, so
11944   // peek through FP casts.
11945   if (SignOp.getOpcode() == ISD::FP_EXTEND ||
11946       SignOp.getOpcode() == ISD::FP_ROUND)
11947     SignOp = SignOp.getOperand(0);
11948 
11949   SelectionDAG &DAG = DCI.DAG;
11950   SDLoc DL(N);
11951   EVT SignVT = SignOp.getValueType();
11952 
11953   // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11954   // lower half with a copy.
11955   // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11956   EVT MagVT = MagnitudeOp.getValueType();
11957 
11958   unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
11959 
11960   if (MagVT.getScalarType() == MVT::f64) {
11961     EVT F32VT = MagVT.isVector()
11962                     ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
11963                     : MVT::v2f32;
11964 
11965     SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
11966 
11967     SmallVector<SDValue, 8> NewElts;
11968     for (unsigned I = 0; I != NumElts; ++I) {
11969       SDValue MagLo =
11970           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11971                       DAG.getConstant(2 * I, DL, MVT::i32));
11972       SDValue MagHi =
11973           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11974                       DAG.getConstant(2 * I + 1, DL, MVT::i32));
11975 
11976       SDValue SignOpElt =
11977           MagVT.isVector()
11978               ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
11979                             SignOp, DAG.getConstant(I, DL, MVT::i32))
11980               : SignOp;
11981 
11982       SDValue HiOp =
11983           DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
11984 
11985       SDValue Vector =
11986           DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11987 
11988       SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11989       NewElts.push_back(NewElt);
11990     }
11991 
11992     if (NewElts.size() == 1)
11993       return NewElts[0];
11994 
11995     return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
11996   }
11997 
11998   if (SignVT.getScalarType() != MVT::f64)
11999     return SDValue();
12000 
12001   // Reduce width of sign operand, we only need the highest bit.
12002   //
12003   // fcopysign f64:x, f64:y ->
12004   //   fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
12005   // TODO: In some cases it might make sense to go all the way to f16.
12006 
12007   EVT F32VT = MagVT.isVector()
12008                   ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12009                   : MVT::v2f32;
12010 
12011   SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
12012 
12013   SmallVector<SDValue, 8> F32Signs;
12014   for (unsigned I = 0; I != NumElts; ++I) {
12015     // Take sign from odd elements of cast vector
12016     SDValue SignAsF32 =
12017         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
12018                     DAG.getConstant(2 * I + 1, DL, MVT::i32));
12019     F32Signs.push_back(SignAsF32);
12020   }
12021 
12022   SDValue NewSign =
12023       NumElts == 1
12024           ? F32Signs.back()
12025           : DAG.getNode(ISD::BUILD_VECTOR, DL,
12026                         EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
12027                         F32Signs);
12028 
12029   return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
12030                      NewSign);
12031 }
12032 
12033 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
12034 // (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
12035 // bits
12036 
12037 // This is a variant of
12038 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
12039 //
12040 // The normal DAG combiner will do this, but only if the add has one use since
12041 // that would increase the number of instructions.
12042 //
12043 // This prevents us from seeing a constant offset that can be folded into a
12044 // memory instruction's addressing mode. If we know the resulting add offset of
12045 // a pointer can be folded into an addressing offset, we can replace the pointer
12046 // operand with the add of the new constant offset. This eliminates one of the uses,
12047 // and may allow the remaining use to also be simplified.
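      // For example, assuming a machine whose addressing mode accepts a +64
      // immediate offset: (shl (add x, 16), 2) -> (add (shl x, 2), 64), and
      // the 64 then folds into the memory instruction.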
12048 //
12049 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12050                                                EVT MemVT,
12051                                                DAGCombinerInfo &DCI) const {
12052   SDValue N0 = N->getOperand(0);
12053   SDValue N1 = N->getOperand(1);
12054 
12055   // We only do this to handle cases where it's profitable when there are
12056   // multiple uses of the add, so defer to the standard combine.
12057   if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
12058       N0->hasOneUse())
12059     return SDValue();
12060 
12061   const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
12062   if (!CN1)
12063     return SDValue();
12064 
12065   const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
12066   if (!CAdd)
12067     return SDValue();
12068 
12069   SelectionDAG &DAG = DCI.DAG;
12070 
12071   if (N0->getOpcode() == ISD::OR &&
12072       !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
12073     return SDValue();
12074 
12075   // If the resulting offset is too large, we can't fold it into the
12076   // addressing mode offset.
12077   APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
12078   Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
12079 
12080   AddrMode AM;
12081   AM.HasBaseReg = true;
12082   AM.BaseOffs = Offset.getSExtValue();
12083   if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
12084     return SDValue();
12085 
12086   SDLoc SL(N);
12087   EVT VT = N->getValueType(0);
12088 
12089   SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
12090   SDValue COffset = DAG.getConstant(Offset, SL, VT);
12091 
12092   SDNodeFlags Flags;
12093   Flags.setNoUnsignedWrap(
12094       N->getFlags().hasNoUnsignedWrap() &&
12095       (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
12096 
12097   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
12098 }
12099 
12100 /// MemSDNode::getBasePtr() does not work for intrinsics, for which we need to
12101 /// offset past the chain and intrinsic ID. Theoretically we would also need to
12102 /// check the specific intrinsic, but they all place the pointer operand first.
12103 static unsigned getBasePtrIndex(const MemSDNode *N) {
12104   switch (N->getOpcode()) {
12105   case ISD::STORE:
12106   case ISD::INTRINSIC_W_CHAIN:
12107   case ISD::INTRINSIC_VOID:
12108     return 2;
12109   default:
12110     return 1;
12111   }
12112 }
12113 
12114 SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
12115                                                   DAGCombinerInfo &DCI) const {
12116   SelectionDAG &DAG = DCI.DAG;
12117 
12118   unsigned PtrIdx = getBasePtrIndex(N);
12119   SDValue Ptr = N->getOperand(PtrIdx);
12120 
12121   // TODO: We could also do this for multiplies.
12122   if (Ptr.getOpcode() == ISD::SHL) {
12123     SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
12124                                           N->getMemoryVT(), DCI);
12125     if (NewPtr) {
12126       SmallVector<SDValue, 8> NewOps(N->ops());
12127 
12128       NewOps[PtrIdx] = NewPtr;
12129       return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
12130     }
12131   }
12132 
12133   return SDValue();
12134 }
12135 
12136 static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
12137   return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
12138          (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
12139          (Opc == ISD::XOR && Val == 0);
12140 }
12141 
12142 // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
12143 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
12144 // integer combine opportunities since most 64-bit operations are decomposed
12145 // this way.  TODO: We won't want this for SALU especially if it is an inline
12146 // immediate.
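      // For example, (and x:i64, 0xffffffff00000000) splits into 32-bit
      // halves: the low half folds to 0 and the high half folds to x.hi, so
      // no 64-bit literal needs to be materialized.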
12147 SDValue SITargetLowering::splitBinaryBitConstantOp(
12148     DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
12149     const ConstantSDNode *CRHS) const {
12150   uint64_t Val = CRHS->getZExtValue();
12151   uint32_t ValLo = Lo_32(Val);
12152   uint32_t ValHi = Hi_32(Val);
12153   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12154 
12155   if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
12156        bitOpWithConstantIsReducible(Opc, ValHi)) ||
12157       (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
12158     // We have 64-bit scalar and/or/xor, but do not have vector forms.
12159     if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
12160         !CRHS->user_begin()->isDivergent())
12161       return SDValue();
12162 
12163     // If we need to materialize a 64-bit immediate, it will be split up later
12164     // anyway. Avoid creating the harder to understand 64-bit immediate
12165     // materialization.
12166     return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
12167   }
12168 
12169   return SDValue();
12170 }
12171 
12172 bool llvm::isBoolSGPR(SDValue V) {
12173   if (V.getValueType() != MVT::i1)
12174     return false;
12175   switch (V.getOpcode()) {
12176   default:
12177     break;
12178   case ISD::SETCC:
12179   case ISD::IS_FPCLASS:
12180   case AMDGPUISD::FP_CLASS:
12181     return true;
12182   case ISD::AND:
12183   case ISD::OR:
12184   case ISD::XOR:
12185     return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
12186   case ISD::SADDO:
12187   case ISD::UADDO:
12188   case ISD::SSUBO:
12189   case ISD::USUBO:
12190   case ISD::SMULO:
12191   case ISD::UMULO:
12192     return V.getResNo() == 1;
12193   case ISD::INTRINSIC_WO_CHAIN: {
12194     unsigned IntrinsicID = V.getConstantOperandVal(0);
12195     switch (IntrinsicID) {
12196     case Intrinsic::amdgcn_is_shared:
12197     case Intrinsic::amdgcn_is_private:
12198       return true;
12199     default:
12200       return false;
12201     }
12204   }
12205   }
12206   return false;
12207 }
12208 
12209 // If a constant has all zeroes or all ones within each byte return it.
12210 // Otherwise return 0.
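      // e.g. 0x00ff00ff is returned as-is, while 0x00f0ff00 returns 0 because
      // one of its bytes is only partially set.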
12211 static uint32_t getConstantPermuteMask(uint32_t C) {
12212   // 0xff for any zero byte in the mask
12213   uint32_t ZeroByteMask = 0;
12214   if (!(C & 0x000000ff))
12215     ZeroByteMask |= 0x000000ff;
12216   if (!(C & 0x0000ff00))
12217     ZeroByteMask |= 0x0000ff00;
12218   if (!(C & 0x00ff0000))
12219     ZeroByteMask |= 0x00ff0000;
12220   if (!(C & 0xff000000))
12221     ZeroByteMask |= 0xff000000;
12222   uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
12223   if ((NonZeroByteMask & C) != NonZeroByteMask)
12224     return 0; // Partial bytes selected.
12225   return C;
12226 }
12227 
12228 // Check if a node selects whole bytes from its operand 0 starting at a byte
12229 // boundary while masking the rest. Returns the select mask as used in
12230 // v_perm_b32, or ~0 if it does not succeed.
12231 // Note byte select encoding:
12232 // value 0-3 selects corresponding source byte;
12233 // value 0xc selects zero;
12234 // value 0xff selects 0xff.
12235 static uint32_t getPermuteMask(SDValue V) {
12236   assert(V.getValueSizeInBits() == 32);
12237 
12238   if (V.getNumOperands() != 2)
12239     return ~0;
12240 
12241   ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
12242   if (!N1)
12243     return ~0;
12244 
12245   uint32_t C = N1->getZExtValue();
12246 
12247   switch (V.getOpcode()) {
12248   default:
12249     break;
12250   case ISD::AND:
12251     if (uint32_t ConstMask = getConstantPermuteMask(C))
12252       return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
12253     break;
12254 
12255   case ISD::OR:
12256     if (uint32_t ConstMask = getConstantPermuteMask(C))
12257       return (0x03020100 & ~ConstMask) | ConstMask;
12258     break;
12259 
12260   case ISD::SHL:
12261     if (C % 8)
12262       return ~0;
12263 
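          // The constant packs identity byte selectors 0x03020100 above zero
          // selectors 0x0c0c0c0c; a byte-aligned left shift slides zero
          // selectors in from the low end, mirroring how shl zero-fills.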
12264     return uint32_t((0x030201000c0c0c0cull << C) >> 32);
12265 
12266   case ISD::SRL:
12267     if (C % 8)
12268       return ~0;
12269 
12270     return uint32_t(0x0c0c0c0c03020100ull >> C);
12271   }
12272 
12273   return ~0;
12274 }
12275 
12276 SDValue SITargetLowering::performAndCombine(SDNode *N,
12277                                             DAGCombinerInfo &DCI) const {
12278   if (DCI.isBeforeLegalize())
12279     return SDValue();
12280 
12281   SelectionDAG &DAG = DCI.DAG;
12282   EVT VT = N->getValueType(0);
12283   SDValue LHS = N->getOperand(0);
12284   SDValue RHS = N->getOperand(1);
12285 
12286   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12287   if (VT == MVT::i64 && CRHS) {
12288     if (SDValue Split =
12289             splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
12290       return Split;
12291   }
12292 
12293   if (CRHS && VT == MVT::i32) {
12294     // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
12295     // nb = number of trailing zeroes in mask
12296     // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
12297     // given that we are selecting 8 or 16 bit fields starting at byte boundary.
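          // e.g. with mask 0xff00 and shift 8: nb = 8, so
          // (and (srl x, 8), 0xff00) -> (shl (bfe x, 16, 8), 8).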
12298     uint64_t Mask = CRHS->getZExtValue();
12299     unsigned Bits = llvm::popcount(Mask);
12300     if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
12301         (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
12302       if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
12303         unsigned Shift = CShift->getZExtValue();
12304         unsigned NB = CRHS->getAPIntValue().countr_zero();
12305         unsigned Offset = NB + Shift;
12306         if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
12307           SDLoc SL(N);
12308           SDValue BFE =
12309               DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
12310                           DAG.getConstant(Offset, SL, MVT::i32),
12311                           DAG.getConstant(Bits, SL, MVT::i32));
12312           EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
12313           SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
12314                                     DAG.getValueType(NarrowVT));
12315           SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
12316                                     DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
12317           return Shl;
12318         }
12319       }
12320     }
12321 
12322     // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12323     if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
12324         isa<ConstantSDNode>(LHS.getOperand(2))) {
12325       uint32_t Sel = getConstantPermuteMask(Mask);
12326       if (!Sel)
12327         return SDValue();
12328 
12329       // Select 0xc for all zero bytes
12330       Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
12331       SDLoc DL(N);
12332       return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12333                          LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12334     }
12335   }
12336 
12337   // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
12338   // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
12339   if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
12340     ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
12341     ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
12342 
12343     SDValue X = LHS.getOperand(0);
12344     SDValue Y = RHS.getOperand(0);
12345     if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
12346         !isTypeLegal(X.getValueType()))
12347       return SDValue();
12348 
12349     if (LCC == ISD::SETO) {
12350       if (X != LHS.getOperand(1))
12351         return SDValue();
12352 
12353       if (RCC == ISD::SETUNE) {
12354         const ConstantFPSDNode *C1 =
12355             dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
12356         if (!C1 || !C1->isInfinity() || C1->isNegative())
12357           return SDValue();
12358 
12359         const uint32_t Mask = SIInstrFlags::N_NORMAL |
12360                               SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
12361                               SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
12362                               SIInstrFlags::P_NORMAL;
12363 
12364         static_assert(
12365             ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
12366                 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
12367              0x3ff) == Mask,
12368             "mask not equal");
12369 
12370         SDLoc DL(N);
12371         return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
12372                            DAG.getConstant(Mask, DL, MVT::i32));
12373       }
12374     }
12375   }
12376 
12377   if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
12378     std::swap(LHS, RHS);
12379 
12380   if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12381       RHS.hasOneUse()) {
12382     ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
12383     // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
12384     // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
12386     const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12387     if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
12388         (RHS.getOperand(0) == LHS.getOperand(0) &&
12389          LHS.getOperand(0) == LHS.getOperand(1))) {
12390       const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
12391       unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
12392                                           : Mask->getZExtValue() & OrdMask;
12393 
12394       SDLoc DL(N);
12395       return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
12396                          DAG.getConstant(NewMask, DL, MVT::i32));
12397     }
12398   }
12399 
12400   if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
12401                          LHS.getOpcode() == ISD::SIGN_EXTEND)) {
12402     // and x, (sext cc from i1) => select cc, x, 0
12403     if (RHS.getOpcode() != ISD::SIGN_EXTEND)
12404       std::swap(LHS, RHS);
12405     if (isBoolSGPR(RHS.getOperand(0)))
12406       return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
12407                            DAG.getConstant(0, SDLoc(N), MVT::i32));
12408   }
12409 
12410   // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12411   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12412   if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12413       N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12414     uint32_t LHSMask = getPermuteMask(LHS);
12415     uint32_t RHSMask = getPermuteMask(RHS);
12416     if (LHSMask != ~0u && RHSMask != ~0u) {
12417       // Canonicalize the expression in an attempt to have fewer unique masks
12418       // and therefore fewer registers used to hold the masks.
12419       if (LHSMask > RHSMask) {
12420         std::swap(LHSMask, RHSMask);
12421         std::swap(LHS, RHS);
12422       }
12423 
12424       // Select 0xc for each lane used from a source operand. Zero has the
12425       // 0xc mask set, 0xff has 0xff in the mask, and actual lanes are 0-3.
12426       uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12427       uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12428 
12429       // Check if we need to combine values from two sources within a byte.
12430       if (!(LHSUsedLanes & RHSUsedLanes) &&
12431           // If we select high and lower word keep it for SDWA.
12432           // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12433           !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12434         // Each byte in each mask is either a selector in the 0-3 range, or
12435         // has higher bits set: 0xff for the constant 0xff, or 0x0c for zero.
12436         // If 0x0c appears in either mask, the result byte must be 0x0c.
12437         // Otherwise the mask that is not 0xff wins. ANDing both masks gives a
12438         // correct result except that 0x0c bytes must be fixed up to be 0x0c.
12439         uint32_t Mask = LHSMask & RHSMask;
12440         for (unsigned I = 0; I < 32; I += 8) {
12441           uint32_t ByteSel = 0xff << I;
12442           if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
12443             Mask &= (0x0c << I) ^ 0xffffffff; // Clear the 0x0c bits of byte I.
12444         }
12445 
12446         // Add 4 to each active LHS lane. It will not affect any existing 0xff
12447         // or 0x0c.
12448         uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
12449         SDLoc DL(N);
12450 
12451         return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12452                            RHS.getOperand(0),
12453                            DAG.getConstant(Sel, DL, MVT::i32));
12454       }
12455     }
12456   }
12457 
12458   return SDValue();
12459 }
12460 
12461 // A key component of v_perm is a mapping between the byte positions of the
12462 // src operands and the byte positions of the dest. To provide such a mapping
12463 // we need: 1. the node that provides byte x of the dest of the OR, and 2. the
12464 // byte of that node used to provide byte x. calculateByteProvider finds which
12465 // node provides a certain byte of the dest of the OR, and calculateSrcByte
12466 // takes that node and finds the ultimate src and byte position. For example,
12467 // the supported LoadCombine pattern for vector loads is as follows:
12468 //                                t1
12469 //                                or
12470 //                      /                  \
12471 //                      t2                 t3
12472 //                     zext                shl
12473 //                      |                   |     \
12474 //                     t4                  t5     16
12475 //                     or                 anyext
12476 //                 /        \               |
12477 //                t6        t7             t8
12478 //               srl        shl             or
12479 //            /    |      /     \         /     \
12480 //           t9   t10    t11   t12      t13    t14
12481 //         trunc*  8    trunc*  8      and     and
12482 //           |            |          /    |     |    \
12483 //          t15          t16        t17  t18   t19   t20
12484 //                                trunc*  255   srl   -256
12485 //                                   |         /   \
12486 //                                  t15       t15  16
12487 //
12488 // *In this example, the truncs are from i32->i16
12489 //
12490 // calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
12491 // respectively. calculateSrcByte would find (given node) -> ultimate src &
12492 // byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
12493 // After finding the mapping, we can combine the tree into vperm t15, t16,
12494 // 0x05000407
12495 
12496 // Find the source and byte position from a node.
12497 // \p DestByte is the byte position of the dest of the or that the src
12498 // ultimately provides. \p SrcIndex is the byte of the src that maps to this
12499 // dest of the or byte. \p Depth tracks how many recursive iterations we have
12500 // performed.
12501 static const std::optional<ByteProvider<SDValue>>
12502 calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
12503                  unsigned Depth = 0) {
12504   // We may need to recursively traverse a series of SRLs
12505   if (Depth >= 6)
12506     return std::nullopt;
12507 
12508   if (Op.getValueSizeInBits() < 8)
12509     return std::nullopt;
12510 
12511   if (Op.getValueType().isVector())
12512     return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12513 
12514   switch (Op->getOpcode()) {
12515   case ISD::TRUNCATE: {
12516     return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12517   }
12518 
12519   case ISD::SIGN_EXTEND:
12520   case ISD::ZERO_EXTEND:
12521   case ISD::SIGN_EXTEND_INREG: {
12522     SDValue NarrowOp = Op->getOperand(0);
12523     auto NarrowVT = NarrowOp.getValueType();
12524     if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
12525       auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12526       NarrowVT = VTSign->getVT();
12527     }
12528     if (!NarrowVT.isByteSized())
12529       return std::nullopt;
12530     uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
12531 
12532     if (SrcIndex >= NarrowByteWidth)
12533       return std::nullopt;
12534     return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12535   }
12536 
12537   case ISD::SRA:
12538   case ISD::SRL: {
12539     auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12540     if (!ShiftOp)
12541       return std::nullopt;
12542 
12543     uint64_t BitShift = ShiftOp->getZExtValue();
12544 
12545     if (BitShift % 8 != 0)
12546       return std::nullopt;
12547 
12548     SrcIndex += BitShift / 8;
12549 
12550     return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12551   }
12552 
12553   default: {
12554     return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12555   }
12556   }
12557   llvm_unreachable("fully handled switch");
12558 }
12559 
12560 // For a byte position in the result of an Or, traverse the tree and find the
12561 // node (and the byte of the node) which ultimately provides this {Or,
12562 // BytePosition}. \p Op is the operand we are currently examining. \p Index is
12563 // the byte position of the Op that corresponds with the originally requested
12564 // byte of the Or. \p Depth tracks how many recursive iterations we have
12565 // performed. \p StartingIndex is the originally requested byte of the Or.
12566 static const std::optional<ByteProvider<SDValue>>
12567 calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
12568                       unsigned StartingIndex = 0) {
12569   // Finding the Src tree of the RHS of an or typically requires at least one
12570   // additional level of depth.
12571   if (Depth > 6)
12572     return std::nullopt;
12573 
12574   unsigned BitWidth = Op.getScalarValueSizeInBits();
12575   if (BitWidth % 8 != 0)
12576     return std::nullopt;
12577   if (Index > BitWidth / 8 - 1)
12578     return std::nullopt;
12579 
12580   bool IsVec = Op.getValueType().isVector();
12581   switch (Op.getOpcode()) {
12582   case ISD::OR: {
12583     if (IsVec)
12584       return std::nullopt;
12585 
12586     auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
12587                                      StartingIndex);
12588     if (!RHS)
12589       return std::nullopt;
12590     auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12591                                      StartingIndex);
12592     if (!LHS)
12593       return std::nullopt;
12594     // A well formed Or will have two ByteProviders for each byte, one of which
12595     // is constant zero
12596     if (!LHS->isConstantZero() && !RHS->isConstantZero())
12597       return std::nullopt;
12598     if (!LHS || LHS->isConstantZero())
12599       return RHS;
12600     if (!RHS || RHS->isConstantZero())
12601       return LHS;
12602     return std::nullopt;
12603   }
12604 
12605   case ISD::AND: {
12606     if (IsVec)
12607       return std::nullopt;
12608 
12609     auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12610     if (!BitMaskOp)
12611       return std::nullopt;
12612 
12613     uint32_t BitMask = BitMaskOp->getZExtValue();
12614     // Bits we expect for our StartingIndex
12615     uint32_t IndexMask = 0xFF << (Index * 8);
12616 
12617     if ((IndexMask & BitMask) != IndexMask) {
12618       // If the result of the and partially provides the byte, then it
12619       // is not well formed
12620       if (IndexMask & BitMask)
12621         return std::nullopt;
12622       return ByteProvider<SDValue>::getConstantZero();
12623     }
12624 
12625     return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
12626   }
12627 
12628   case ISD::FSHR: {
12629     if (IsVec)
12630       return std::nullopt;
12631 
12632     // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
12633     auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12634     if (!ShiftOp || Op.getValueType().isVector())
12635       return std::nullopt;
12636 
12637     uint64_t BitsProvided = Op.getValueSizeInBits();
12638     if (BitsProvided % 8 != 0)
12639       return std::nullopt;
12640 
12641     uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
12642     if (BitShift % 8)
12643       return std::nullopt;
12644 
12645     uint64_t ConcatSizeInBytes = BitsProvided / 4;
12646     uint64_t ByteShift = BitShift / 8;
12647 
12648     uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12649     uint64_t BytesProvided = BitsProvided / 8;
12650     SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
12651     NewIndex %= BytesProvided;
12652     return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
12653   }
12654 
12655   case ISD::SRA:
12656   case ISD::SRL: {
12657     if (IsVec)
12658       return std::nullopt;
12659 
12660     auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12661     if (!ShiftOp)
12662       return std::nullopt;
12663 
12664     uint64_t BitShift = ShiftOp->getZExtValue();
12665     if (BitShift % 8)
12666       return std::nullopt;
12667 
12668     auto BitsProvided = Op.getScalarValueSizeInBits();
12669     if (BitsProvided % 8 != 0)
12670       return std::nullopt;
12671 
12672     uint64_t BytesProvided = BitsProvided / 8;
12673     uint64_t ByteShift = BitShift / 8;
12674     // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
12675     // If the byte we are trying to provide (as tracked by index) falls in this
12676     // range, then the SRL provides the byte. The byte of interest of the src of
12677     // the SRL is Index + ByteShift
12678     return BytesProvided - ByteShift > Index
12679                ? calculateSrcByte(Op->getOperand(0), StartingIndex,
12680                                   Index + ByteShift)
12681                : ByteProvider<SDValue>::getConstantZero();
12682   }
12683 
12684   case ISD::SHL: {
12685     if (IsVec)
12686       return std::nullopt;
12687 
12688     auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12689     if (!ShiftOp)
12690       return std::nullopt;
12691 
12692     uint64_t BitShift = ShiftOp->getZExtValue();
12693     if (BitShift % 8 != 0)
12694       return std::nullopt;
12695     uint64_t ByteShift = BitShift / 8;
12696 
12697     // If we are shifting by an amount greater than (or equal to)
12698     // the index we are trying to provide, then it provides 0s. If not,
12699     // then these bytes are not definitively 0s, and the corresponding byte
12700     // of interest is Index - ByteShift of the src.
12701     return Index < ByteShift
12702                ? ByteProvider<SDValue>::getConstantZero()
12703                : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
12704                                        Depth + 1, StartingIndex);
12705   }
12706   case ISD::ANY_EXTEND:
12707   case ISD::SIGN_EXTEND:
12708   case ISD::ZERO_EXTEND:
12709   case ISD::SIGN_EXTEND_INREG:
12710   case ISD::AssertZext:
12711   case ISD::AssertSext: {
12712     if (IsVec)
12713       return std::nullopt;
12714 
12715     SDValue NarrowOp = Op->getOperand(0);
12716     unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
12717     if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
12718         Op->getOpcode() == ISD::AssertZext ||
12719         Op->getOpcode() == ISD::AssertSext) {
12720       auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12721       NarrowBitWidth = VTSign->getVT().getSizeInBits();
12722     }
12723     if (NarrowBitWidth % 8 != 0)
12724       return std::nullopt;
12725     uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12726 
12727     if (Index >= NarrowByteWidth)
12728       return Op.getOpcode() == ISD::ZERO_EXTEND
12729                  ? std::optional<ByteProvider<SDValue>>(
12730                        ByteProvider<SDValue>::getConstantZero())
12731                  : std::nullopt;
12732     return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
12733   }
12734 
12735   case ISD::TRUNCATE: {
12736     if (IsVec)
12737       return std::nullopt;
12738 
12739     uint64_t NarrowByteWidth = BitWidth / 8;
12740 
12741     if (NarrowByteWidth >= Index) {
12742       return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12743                                    StartingIndex);
12744     }
12745 
12746     return std::nullopt;
12747   }
12748 
12749   case ISD::CopyFromReg: {
12750     if (BitWidth / 8 > Index)
12751       return calculateSrcByte(Op, StartingIndex, Index);
12752 
12753     return std::nullopt;
12754   }
12755 
12756   case ISD::LOAD: {
12757     auto *L = cast<LoadSDNode>(Op.getNode());
12758 
12759     unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12760     if (NarrowBitWidth % 8 != 0)
12761       return std::nullopt;
12762     uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12763 
12764     // If the width of the load does not reach the byte we are trying to
12765     // provide for and it is not a ZEXTLOAD, then the load does not provide
12766     // for the byte in question.
12767     if (Index >= NarrowByteWidth) {
12768       return L->getExtensionType() == ISD::ZEXTLOAD
12769                  ? std::optional<ByteProvider<SDValue>>(
12770                        ByteProvider<SDValue>::getConstantZero())
12771                  : std::nullopt;
12772     }
12773 
12774     if (NarrowByteWidth > Index) {
12775       return calculateSrcByte(Op, StartingIndex, Index);
12776     }
12777 
12778     return std::nullopt;
12779   }
12780 
12781   case ISD::BSWAP: {
12782     if (IsVec)
12783       return std::nullopt;
12784 
12785     return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12786                                  Depth + 1, StartingIndex);
12787   }
12788 
12789   case ISD::EXTRACT_VECTOR_ELT: {
12790     auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12791     if (!IdxOp)
12792       return std::nullopt;
12793     auto VecIdx = IdxOp->getZExtValue();
12794     auto ScalarSize = Op.getScalarValueSizeInBits();
12795     if (ScalarSize < 32)
12796       Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12797     return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
12798                             StartingIndex, Index);
12799   }
12800 
12801   case AMDGPUISD::PERM: {
12802     if (IsVec)
12803       return std::nullopt;
12804 
12805     auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12806     if (!PermMask)
12807       return std::nullopt;
12808 
12809     auto IdxMask =
12810         (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12811     if (IdxMask > 0x07 && IdxMask != 0x0c)
12812       return std::nullopt;
12813 
12814     auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12815     auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12816 
12817     return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12818                            : ByteProvider<SDValue>(
12819                                  ByteProvider<SDValue>::getConstantZero());
12820   }
12821 
12822   default: {
12823     return std::nullopt;
12824   }
12825   }
12826 
12827   llvm_unreachable("fully handled switch");
12828 }
12829 
12830 // Returns true if the operand is a scalar that was extended from a 16-bit value.
12831 static bool isExtendedFrom16Bits(SDValue &Operand) {
12832 
12833   switch (Operand.getOpcode()) {
12834   case ISD::ANY_EXTEND:
12835   case ISD::SIGN_EXTEND:
12836   case ISD::ZERO_EXTEND: {
12837     auto OpVT = Operand.getOperand(0).getValueType();
12838     return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12839   }
12840   case ISD::LOAD: {
12841     LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12842     auto ExtType = L->getExtensionType();
12843     if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12844         ExtType == ISD::EXTLOAD) {
12845       auto MemVT = L->getMemoryVT();
12846       return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12847     }
12848     return L->getMemoryVT().getSizeInBits() == 16;
12849   }
12850   default:
12851     return false;
12852   }
12853 }
12854 
12855 // Returns true if the mask matches consecutive bytes and the first byte
12856 // begins at an even (16-bit aligned) byte offset from the 0th byte
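// For example, masks 0x0504 and 0x0302 address whole 16-bit halves and return
// true, while 0x0201 is consecutive but starts at the odd byte 1 and returns
// false.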
12857 static bool addresses16Bits(int Mask) {
12858   int Low8 = Mask & 0xff;
12859   int Hi8 = (Mask & 0xff00) >> 8;
12860 
12861   assert(Low8 < 8 && Hi8 < 8);
12862   // Are the bytes contiguous in the order of increasing addresses.
12863   bool IsConsecutive = (Hi8 - Low8 == 1);
12864   // Is the first byte at a location that is aligned for 16-bit instructions?
12865   // A counterexample is taking 2 consecutive bytes starting at byte offset 1
12866   // (bit 8). In this case, we still need code to extract the 16-bit operand,
12867   // so it is better to use the i8 v_perm.
12868   bool Is16Aligned = !(Low8 % 2);
12869 
12870   return IsConsecutive && Is16Aligned;
12871 }
12872 
12873 // Do not lower into v_perm if the operands are actually 16 bit
12874 // and the selected bits (based on PermMask) correspond with two
12875 // easily addressable 16 bit operands.
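// For example (illustrative), PermMask = 0x05040100 selects the low 16 bits of
// each 16-bit operand in order, so both halves are cleanly addressed and this
// returns false (prefer the 16-bit operations over v_perm).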
12876 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12877                                 SDValue &OtherOp) {
12878   int Low16 = PermMask & 0xffff;
12879   int Hi16 = (PermMask & 0xffff0000) >> 16;
12880 
12881   auto TempOp = peekThroughBitcasts(Op);
12882   auto TempOtherOp = peekThroughBitcasts(OtherOp);
12883 
12884   auto OpIs16Bit =
12885       TempOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12886   if (!OpIs16Bit)
12887     return true;
12888 
12889   auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12890                         isExtendedFrom16Bits(TempOtherOp);
12891   if (!OtherOpIs16Bit)
12892     return true;
12893 
12894   // Do we cleanly address both 16-bit halves?
12895   return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12896 }
12897 
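// Extract the 32-bit dword at dword offset \p DWordOffset from \p Src. For
// instance (illustrative), on a v8i16 source with DWordOffset = 1 this
// extracts elements 2 and 3, rebuilds them as a v2i16, and bitcasts the pair
// to the requested i32.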
12898 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12899                                   unsigned DWordOffset) {
12900   SDValue Ret;
12901 
12902   auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12903   // ByteProvider must be at least 8 bits
12904   assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12905 
12906   if (TypeSize <= 32)
12907     return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12908 
12909   if (Src.getValueType().isVector()) {
12910     auto ScalarTySize = Src.getScalarValueSizeInBits();
12911     auto ScalarTy = Src.getValueType().getScalarType();
12912     if (ScalarTySize == 32) {
12913       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12914                          DAG.getConstant(DWordOffset, SL, MVT::i32));
12915     }
12916     if (ScalarTySize > 32) {
12917       Ret = DAG.getNode(
12918           ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12919           DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12920       auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12921       if (ShiftVal)
12922         Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12923                           DAG.getConstant(ShiftVal, SL, MVT::i32));
12924       return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12925     }
12926 
12927     assert(ScalarTySize < 32);
12928     auto NumElements = TypeSize / ScalarTySize;
12929     auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12930     auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12931     auto NumElementsIn32 = 32 / ScalarTySize;
12932     auto NumAvailElements = DWordOffset < Trunc32Elements
12933                                 ? NumElementsIn32
12934                                 : NumElements - NormalizedTrunc;
12935 
12936     SmallVector<SDValue, 4> VecSrcs;
12937     DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12938                               NumAvailElements);
12939 
12940     Ret = DAG.getBuildVector(
12941         MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12942         VecSrcs);
12943     return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12944   }
12945 
12946   // Scalar type.
12947   auto ShiftVal = 32 * DWordOffset;
12948   Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12949                     DAG.getConstant(ShiftVal, SL, MVT::i32));
12950   return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12951 }
12952 
12953 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12954   SelectionDAG &DAG = DCI.DAG;
12955   [[maybe_unused]] EVT VT = N->getValueType(0);
12956   SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12957 
12958   // VT is known to be MVT::i32, so we need to provide 4 bytes.
12959   assert(VT == MVT::i32);
12960   for (int i = 0; i < 4; i++) {
12961     // Find the ByteProvider that provides the ith byte of the result of OR
12962     std::optional<ByteProvider<SDValue>> P =
12963         calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12964     // TODO support constantZero
12965     if (!P || P->isConstantZero())
12966       return SDValue();
12967 
12968     PermNodes.push_back(*P);
12969   }
12970   if (PermNodes.size() != 4)
12971     return SDValue();
12972 
12973   std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12974   std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12975   uint64_t PermMask = 0x00000000;
12976   for (size_t i = 0; i < PermNodes.size(); i++) {
12977     auto PermOp = PermNodes[i];
12978     // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12979     // by sizeof(Src2) = 4
12980     int SrcByteAdjust = 4;
12981 
12982     // If the Src uses a byte from a different DWORD, then it corresponds
12983     // with a different source
12984     if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12985         ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12986       if (SecondSrc)
12987         if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12988             ((PermOp.SrcOffset / 4) != SecondSrc->second))
12989           return SDValue();
12990 
12991       // Set the index of the second distinct Src node
12992       SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12993       assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12994       SrcByteAdjust = 0;
12995     }
12996     assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12997     assert(!DAG.getDataLayout().isBigEndian());
12998     PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12999   }
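  // Worked example (illustrative): if all four bytes come, in order, from the
  // same dword of a single source, each byte gets SrcByteAdjust = 4 and
  // PermMask becomes 0x07060504; the !SecondSrc check below then folds the
  // whole expression to Op instead of emitting a redundant v_perm.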
13000   SDLoc DL(N);
13001   SDValue Op = *PermNodes[FirstSrc.first].Src;
13002   Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
13003   assert(Op.getValueSizeInBits() == 32);
13004 
13005   // Check that we are not just extracting the bytes in order from an op
13006   if (!SecondSrc) {
13007     int Low16 = PermMask & 0xffff;
13008     int Hi16 = (PermMask & 0xffff0000) >> 16;
13009 
13010     bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13011     bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13012 
13013     // The perm op would really just produce Op. So combine into Op
13014     if (WellFormedLow && WellFormedHi)
13015       return DAG.getBitcast(MVT::getIntegerVT(32), Op);
13016   }
13017 
13018   SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13019 
13020   if (SecondSrc) {
13021     OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
13022     assert(OtherOp.getValueSizeInBits() == 32);
13023   }
13024 
13025   if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
13026 
13027     assert(Op.getValueType().isByteSized() &&
13028            OtherOp.getValueType().isByteSized());
13029 
13030     // If the ultimate src is less than 32 bits, then we will only be
13031     // using bytes 0..Op.getValueSizeInBytes() - 1 in the or.
13032     // CalculateByteProvider would not have returned Op as source if we
13033     // used a byte that is outside its ValueType. Thus, we are free to
13034     // ANY_EXTEND as the extended bits are don't-cares.
13035     Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
13036     OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
13037 
13038     return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
13039                        DAG.getConstant(PermMask, DL, MVT::i32));
13040   }
13041   return SDValue();
13042 }
13043 
13044 SDValue SITargetLowering::performOrCombine(SDNode *N,
13045                                            DAGCombinerInfo &DCI) const {
13046   SelectionDAG &DAG = DCI.DAG;
13047   SDValue LHS = N->getOperand(0);
13048   SDValue RHS = N->getOperand(1);
13049 
13050   EVT VT = N->getValueType(0);
13051   if (VT == MVT::i1) {
13052     // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
13053     if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13054         RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
13055       SDValue Src = LHS.getOperand(0);
13056       if (Src != RHS.getOperand(0))
13057         return SDValue();
13058 
13059       const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
13060       const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13061       if (!CLHS || !CRHS)
13062         return SDValue();
13063 
13064       // Only 10 bits are used.
13065       static const uint32_t MaxMask = 0x3ff;
13066 
13067       uint32_t NewMask =
13068           (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
13069       SDLoc DL(N);
13070       return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
13071                          DAG.getConstant(NewMask, DL, MVT::i32));
13072     }
13073 
13074     return SDValue();
13075   }
13076 
13077   // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13078   if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
13079       LHS.getOpcode() == AMDGPUISD::PERM &&
13080       isa<ConstantSDNode>(LHS.getOperand(2))) {
13081     uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
13082     if (!Sel)
13083       return SDValue();
13084 
13085     Sel |= LHS.getConstantOperandVal(2);
13086     SDLoc DL(N);
13087     return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13088                        LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13089   }
13090 
13091   // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13092   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13093   if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13094       N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13095 
13096     // If all the uses of an or need to extract the individual elements, do not
13097     // attempt to lower into v_perm
13098     auto usesCombinedOperand = [](SDNode *OrUse) {
13099       // If the use is not a bitcast to a vector, it is a candidate for v_perm
13100       if (OrUse->getOpcode() != ISD::BITCAST ||
13101           !OrUse->getValueType(0).isVector())
13102         return true;
13103 
13104       // If we have any non-vectorized use, then it is a candidate for v_perm
13105       for (auto *VUser : OrUse->users()) {
13106         if (!VUser->getValueType(0).isVector())
13107           return true;
13108 
13109         // If the use of a vector is a store, then combining via a v_perm
13110         // is beneficial.
13111         // TODO -- whitelist more uses
13112         for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
13113           if (VUser->getOpcode() == VectorwiseOp)
13114             return true;
13115       }
13116       return false;
13117     };
13118 
13119     if (!any_of(N->users(), usesCombinedOperand))
13120       return SDValue();
13121 
13122     uint32_t LHSMask = getPermuteMask(LHS);
13123     uint32_t RHSMask = getPermuteMask(RHS);
13124 
13125     if (LHSMask != ~0u && RHSMask != ~0u) {
13126       // Canonicalize the expression in an attempt to have fewer unique masks
13127       // and therefore fewer registers used to hold the masks.
13128       if (LHSMask > RHSMask) {
13129         std::swap(LHSMask, RHSMask);
13130         std::swap(LHS, RHS);
13131       }
13132 
13133       // Select 0xc for each lane used from the source operand: zero bytes get
13134       // 0xc, 0xff bytes get 0xff, and actual lanes are in the 0-3 range.
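      // Illustrative example: LHSMask = 0x030c0c0c keeps only result byte 3
      // (LHSUsedLanes = 0x0c000000) and RHSMask = 0x0c0c0c00 keeps only result
      // byte 0 (RHSUsedLanes = 0x0000000c). The used lanes do not overlap, and
      // after zapping the zero bytes and offsetting the LHS lanes by 4, the
      // combined mask is Sel = 0x070c0c00.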
13135       uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13136       uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13137 
13138       // Check if we need to combine values from two sources within a byte.
13139       if (!(LHSUsedLanes & RHSUsedLanes) &&
13140           // If we select the high word of one source and the low word of the
13140           // other, keep it for SDWA.
13141           // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13142           !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13143         // Kill zero bytes selected by other mask. Zero value is 0xc.
13144         LHSMask &= ~RHSUsedLanes;
13145         RHSMask &= ~LHSUsedLanes;
13146         // Add 4 to each active LHS lane
13147         LHSMask |= LHSUsedLanes & 0x04040404;
13148         // Combine masks
13149         uint32_t Sel = LHSMask | RHSMask;
13150         SDLoc DL(N);
13151 
13152         return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13153                            RHS.getOperand(0),
13154                            DAG.getConstant(Sel, DL, MVT::i32));
13155       }
13156     }
13157     if (LHSMask == ~0u || RHSMask == ~0u) {
13158       if (SDValue Perm = matchPERM(N, DCI))
13159         return Perm;
13160     }
13161   }
13162 
13163   if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
13164     return SDValue();
13165 
13166   // TODO: This could be a generic combine with a predicate for extracting the
13167   // high half of an integer being free.
13168 
13169   // (or i64:x, (zero_extend i32:y)) ->
13170   //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
13171   if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
13172       RHS.getOpcode() != ISD::ZERO_EXTEND)
13173     std::swap(LHS, RHS);
13174 
13175   if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
13176     SDValue ExtSrc = RHS.getOperand(0);
13177     EVT SrcVT = ExtSrc.getValueType();
13178     if (SrcVT == MVT::i32) {
13179       SDLoc SL(N);
13180       auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
13181       SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
13182 
13183       DCI.AddToWorklist(LowOr.getNode());
13184       DCI.AddToWorklist(HiBits.getNode());
13185 
13186       SDValue Vec =
13187           DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
13188       return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
13189     }
13190   }
13191 
13192   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
13193   if (CRHS) {
13194     if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
13195                                                  N->getOperand(0), CRHS))
13196       return Split;
13197   }
13198 
13199   return SDValue();
13200 }
13201 
13202 SDValue SITargetLowering::performXorCombine(SDNode *N,
13203                                             DAGCombinerInfo &DCI) const {
13204   if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
13205     return RV;
13206 
13207   SDValue LHS = N->getOperand(0);
13208   SDValue RHS = N->getOperand(1);
13209 
13210   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13211   SelectionDAG &DAG = DCI.DAG;
13212 
13213   EVT VT = N->getValueType(0);
13214   if (CRHS && VT == MVT::i64) {
13215     if (SDValue Split =
13216             splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
13217       return Split;
13218   }
13219 
13220   // Make sure to apply the 64-bit constant splitting fold before trying to fold
13221   // fneg-like xors into 64-bit select.
13222   if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
13223     // This looks like an fneg, try to fold as a source modifier.
13224     if (CRHS && CRHS->getAPIntValue().isSignMask() &&
13225         shouldFoldFNegIntoSrc(N, LHS)) {
13226       // xor (select c, a, b), 0x80000000 ->
13227       //   bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
13228       SDLoc DL(N);
13229       SDValue CastLHS =
13230           DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
13231       SDValue CastRHS =
13232           DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
13233       SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
13234       SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
13235       SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
13236                                       LHS->getOperand(0), FNegLHS, FNegRHS);
13237       return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
13238     }
13239   }
13240 
13241   return SDValue();
13242 }
13243 
13244 SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
13245                                                    DAGCombinerInfo &DCI) const {
13246   if (!Subtarget->has16BitInsts() ||
13247       DCI.getDAGCombineLevel() < AfterLegalizeDAG)
13248     return SDValue();
13249 
13250   EVT VT = N->getValueType(0);
13251   if (VT != MVT::i32)
13252     return SDValue();
13253 
13254   SDValue Src = N->getOperand(0);
13255   if (Src.getValueType() != MVT::i16)
13256     return SDValue();
13257 
13258   return SDValue();
13259 }
13260 
13261 SDValue
13262 SITargetLowering::performSignExtendInRegCombine(SDNode *N,
13263                                                 DAGCombinerInfo &DCI) const {
13264   SDValue Src = N->getOperand(0);
13265   auto *VTSign = cast<VTSDNode>(N->getOperand(1));
13266 
13267   // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
13268   // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
13269   if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
13270         VTSign->getVT() == MVT::i8) ||
13271        (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
13272         VTSign->getVT() == MVT::i16))) {
13273     assert(Subtarget->hasScalarSubwordLoads() &&
13274            "s_buffer_load_{u8, i8} are supported "
13275            "in GFX12 (or newer) architectures.");
13276     EVT VT = Src.getValueType();
13277     unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
13278                        ? AMDGPUISD::SBUFFER_LOAD_BYTE
13279                        : AMDGPUISD::SBUFFER_LOAD_SHORT;
13280     SDLoc DL(N);
13281     SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
13282     SDValue Ops[] = {
13283         Src.getOperand(0), // source register
13284         Src.getOperand(1), // offset
13285         Src.getOperand(2)  // cachePolicy
13286     };
13287     auto *M = cast<MemSDNode>(Src);
13288     SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
13289         Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13290     SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
13291     return LoadVal;
13292   }
13293   if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
13294         VTSign->getVT() == MVT::i8) ||
13295        (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
13296         VTSign->getVT() == MVT::i16)) &&
13297       Src.hasOneUse()) {
13298     auto *M = cast<MemSDNode>(Src);
13299     SDValue Ops[] = {Src.getOperand(0), // Chain
13300                      Src.getOperand(1), // rsrc
13301                      Src.getOperand(2), // vindex
13302                      Src.getOperand(3), // voffset
13303                      Src.getOperand(4), // soffset
13304                      Src.getOperand(5), // offset
13305                      Src.getOperand(6), Src.getOperand(7)};
13306     // replace with BUFFER_LOAD_BYTE/SHORT
13307     SDVTList ResList =
13308         DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
13309     unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
13310                        ? AMDGPUISD::BUFFER_LOAD_BYTE
13311                        : AMDGPUISD::BUFFER_LOAD_SHORT;
13312     SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
13313         Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13314     return DCI.DAG.getMergeValues(
13315         {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
13316   }
13317   return SDValue();
13318 }
13319 
13320 SDValue SITargetLowering::performClassCombine(SDNode *N,
13321                                               DAGCombinerInfo &DCI) const {
13322   SelectionDAG &DAG = DCI.DAG;
13323   SDValue Mask = N->getOperand(1);
13324 
13325   // fp_class x, 0 -> false
13326   if (isNullConstant(Mask))
13327     return DAG.getConstant(0, SDLoc(N), MVT::i1);
13328 
13329   if (N->getOperand(0).isUndef())
13330     return DAG.getUNDEF(MVT::i1);
13331 
13332   return SDValue();
13333 }
13334 
13335 SDValue SITargetLowering::performRcpCombine(SDNode *N,
13336                                             DAGCombinerInfo &DCI) const {
13337   EVT VT = N->getValueType(0);
13338   SDValue N0 = N->getOperand(0);
13339 
13340   if (N0.isUndef()) {
13341     return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
13342                                  SDLoc(N), VT);
13343   }
13344 
13345   if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
13346                          N0.getOpcode() == ISD::SINT_TO_FP)) {
13347     return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
13348                            N->getFlags());
13349   }
13350 
13351   // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
13352   if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
13353       N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
13354     return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
13355                            N->getFlags());
13356   }
13357 
13358   return AMDGPUTargetLowering::performRcpCombine(N, DCI);
13359 }
13360 
13361 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
13362                                        unsigned MaxDepth) const {
13363   unsigned Opcode = Op.getOpcode();
13364   if (Opcode == ISD::FCANONICALIZE)
13365     return true;
13366 
13367   if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13368     const auto &F = CFP->getValueAPF();
13369     if (F.isNaN() && F.isSignaling())
13370       return false;
13371     if (!F.isDenormal())
13372       return true;
13373 
13374     DenormalMode Mode =
13375         DAG.getMachineFunction().getDenormalMode(F.getSemantics());
13376     return Mode == DenormalMode::getIEEE();
13377   }
13378 
13379   // If source is a result of another standard FP operation it is already in
13380   // canonical form.
13381   if (MaxDepth == 0)
13382     return false;
13383 
13384   switch (Opcode) {
13385   // These will flush denorms if required.
13386   case ISD::FADD:
13387   case ISD::FSUB:
13388   case ISD::FMUL:
13389   case ISD::FCEIL:
13390   case ISD::FFLOOR:
13391   case ISD::FMA:
13392   case ISD::FMAD:
13393   case ISD::FSQRT:
13394   case ISD::FDIV:
13395   case ISD::FREM:
13396   case ISD::FP_ROUND:
13397   case ISD::FP_EXTEND:
13398   case ISD::FP16_TO_FP:
13399   case ISD::FP_TO_FP16:
13400   case ISD::BF16_TO_FP:
13401   case ISD::FP_TO_BF16:
13402   case ISD::FLDEXP:
13403   case AMDGPUISD::FMUL_LEGACY:
13404   case AMDGPUISD::FMAD_FTZ:
13405   case AMDGPUISD::RCP:
13406   case AMDGPUISD::RSQ:
13407   case AMDGPUISD::RSQ_CLAMP:
13408   case AMDGPUISD::RCP_LEGACY:
13409   case AMDGPUISD::RCP_IFLAG:
13410   case AMDGPUISD::LOG:
13411   case AMDGPUISD::EXP:
13412   case AMDGPUISD::DIV_SCALE:
13413   case AMDGPUISD::DIV_FMAS:
13414   case AMDGPUISD::DIV_FIXUP:
13415   case AMDGPUISD::FRACT:
13416   case AMDGPUISD::CVT_PKRTZ_F16_F32:
13417   case AMDGPUISD::CVT_F32_UBYTE0:
13418   case AMDGPUISD::CVT_F32_UBYTE1:
13419   case AMDGPUISD::CVT_F32_UBYTE2:
13420   case AMDGPUISD::CVT_F32_UBYTE3:
13421   case AMDGPUISD::FP_TO_FP16:
13422   case AMDGPUISD::SIN_HW:
13423   case AMDGPUISD::COS_HW:
13424     return true;
13425 
13426   // It can/will be lowered or combined as a bit operation.
13427   // Need to check their input recursively to handle.
13428   case ISD::FNEG:
13429   case ISD::FABS:
13430   case ISD::FCOPYSIGN:
13431     return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13432 
13433   case ISD::AND:
13434     if (Op.getValueType() == MVT::i32) {
13435       // Be careful as we only know it is a bitcast floating point type. It
13436       // could be f32, v2f16, we have no way of knowing. Luckily the constant
13437       // value that we optimize for, which comes up in fp32 to bf16 conversions,
13438       // is valid to optimize for all types.
13439       if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
13440         if (RHS->getZExtValue() == 0xffff0000) {
13441           return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13442         }
13443       }
13444     }
13445     break;
13446 
13447   case ISD::FSIN:
13448   case ISD::FCOS:
13449   case ISD::FSINCOS:
13450     return Op.getValueType().getScalarType() != MVT::f16;
13451 
13452   case ISD::FMINNUM:
13453   case ISD::FMAXNUM:
13454   case ISD::FMINNUM_IEEE:
13455   case ISD::FMAXNUM_IEEE:
13456   case ISD::FMINIMUM:
13457   case ISD::FMAXIMUM:
13458   case ISD::FMINIMUMNUM:
13459   case ISD::FMAXIMUMNUM:
13460   case AMDGPUISD::CLAMP:
13461   case AMDGPUISD::FMED3:
13462   case AMDGPUISD::FMAX3:
13463   case AMDGPUISD::FMIN3:
13464   case AMDGPUISD::FMAXIMUM3:
13465   case AMDGPUISD::FMINIMUM3: {
13466     // FIXME: Shouldn't treat the generic operations differently based on
13467     // these. However, we aren't really required to flush the result from
13468     // minnum/maxnum.
13469 
13470     // snans will be quieted, so we only need to worry about denormals.
13471     if (Subtarget->supportsMinMaxDenormModes() ||
13472         // FIXME: denormalsEnabledForType is broken for dynamic
13473         denormalsEnabledForType(DAG, Op.getValueType()))
13474       return true;
13475 
13476     // Flushing may be required.
13477     // On pre-GFX9 targets V_MIN_F32 and others do not flush denorms, so for
13478     // such targets we need to check the inputs recursively.
13479 
13480     // FIXME: Does this apply with clamp? It's implemented with max.
13481     for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
13482       if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
13483         return false;
13484     }
13485 
13486     return true;
13487   }
13488   case ISD::SELECT: {
13489     return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
13490            isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
13491   }
13492   case ISD::BUILD_VECTOR: {
13493     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
13494       SDValue SrcOp = Op.getOperand(i);
13495       if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
13496         return false;
13497     }
13498 
13499     return true;
13500   }
13501   case ISD::EXTRACT_VECTOR_ELT:
13502   case ISD::EXTRACT_SUBVECTOR: {
13503     return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13504   }
13505   case ISD::INSERT_VECTOR_ELT: {
13506     return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
13507            isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
13508   }
13509   case ISD::UNDEF:
13510     // Could be anything.
13511     return false;
13512 
13513   case ISD::BITCAST:
13514     // TODO: This is incorrect as it loses track of the operand's type. We may
13515     // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
13516     // same bits that are canonicalized in one type need not be in the other.
13517     return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13518   case ISD::TRUNCATE: {
13519     // Hack around the mess we make when legalizing extract_vector_elt
13520     if (Op.getValueType() == MVT::i16) {
13521       SDValue TruncSrc = Op.getOperand(0);
13522       if (TruncSrc.getValueType() == MVT::i32 &&
13523           TruncSrc.getOpcode() == ISD::BITCAST &&
13524           TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
13525         return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
13526       }
13527     }
13528     return false;
13529   }
13530   case ISD::INTRINSIC_WO_CHAIN: {
13531     unsigned IntrinsicID = Op.getConstantOperandVal(0);
13532     // TODO: Handle more intrinsics
13533     switch (IntrinsicID) {
13534     case Intrinsic::amdgcn_cvt_pkrtz:
13535     case Intrinsic::amdgcn_cubeid:
13536     case Intrinsic::amdgcn_frexp_mant:
13537     case Intrinsic::amdgcn_fdot2:
13538     case Intrinsic::amdgcn_rcp:
13539     case Intrinsic::amdgcn_rsq:
13540     case Intrinsic::amdgcn_rsq_clamp:
13541     case Intrinsic::amdgcn_rcp_legacy:
13542     case Intrinsic::amdgcn_rsq_legacy:
13543     case Intrinsic::amdgcn_trig_preop:
13544     case Intrinsic::amdgcn_log:
13545     case Intrinsic::amdgcn_exp2:
13546     case Intrinsic::amdgcn_sqrt:
13547       return true;
13548     default:
13549       break;
13550     }
13551 
13552     break;
13553   }
13554   default:
13555     break;
13556   }
13557 
13558   // FIXME: denormalsEnabledForType is broken for dynamic
13559   return denormalsEnabledForType(DAG, Op.getValueType()) &&
13560          DAG.isKnownNeverSNaN(Op);
13561 }
13562 
13563 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
13564                                        unsigned MaxDepth) const {
13565   const MachineRegisterInfo &MRI = MF.getRegInfo();
13566   MachineInstr *MI = MRI.getVRegDef(Reg);
13567   unsigned Opcode = MI->getOpcode();
13568 
13569   if (Opcode == AMDGPU::G_FCANONICALIZE)
13570     return true;
13571 
13572   std::optional<FPValueAndVReg> FCR;
13573   // Constant splat (can be padded with undef) or scalar constant.
13574   if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
13575     if (FCR->Value.isSignaling())
13576       return false;
13577     if (!FCR->Value.isDenormal())
13578       return true;
13579 
13580     DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
13581     return Mode == DenormalMode::getIEEE();
13582   }
13583 
13584   if (MaxDepth == 0)
13585     return false;
13586 
13587   switch (Opcode) {
13588   case AMDGPU::G_FADD:
13589   case AMDGPU::G_FSUB:
13590   case AMDGPU::G_FMUL:
13591   case AMDGPU::G_FCEIL:
13592   case AMDGPU::G_FFLOOR:
13593   case AMDGPU::G_FRINT:
13594   case AMDGPU::G_FNEARBYINT:
13595   case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13596   case AMDGPU::G_INTRINSIC_TRUNC:
13597   case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13598   case AMDGPU::G_FMA:
13599   case AMDGPU::G_FMAD:
13600   case AMDGPU::G_FSQRT:
13601   case AMDGPU::G_FDIV:
13602   case AMDGPU::G_FREM:
13603   case AMDGPU::G_FPOW:
13604   case AMDGPU::G_FPEXT:
13605   case AMDGPU::G_FLOG:
13606   case AMDGPU::G_FLOG2:
13607   case AMDGPU::G_FLOG10:
13608   case AMDGPU::G_FPTRUNC:
13609   case AMDGPU::G_AMDGPU_RCP_IFLAG:
13610   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13611   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13612   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13613   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13614     return true;
13615   case AMDGPU::G_FNEG:
13616   case AMDGPU::G_FABS:
13617   case AMDGPU::G_FCOPYSIGN:
13618     return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
13619   case AMDGPU::G_FMINNUM:
13620   case AMDGPU::G_FMAXNUM:
13621   case AMDGPU::G_FMINNUM_IEEE:
13622   case AMDGPU::G_FMAXNUM_IEEE:
13623   case AMDGPU::G_FMINIMUM:
13624   case AMDGPU::G_FMAXIMUM:
13625   case AMDGPU::G_FMINIMUMNUM:
13626   case AMDGPU::G_FMAXIMUMNUM: {
13627     if (Subtarget->supportsMinMaxDenormModes() ||
13628         // FIXME: denormalsEnabledForType is broken for dynamic
13629         denormalsEnabledForType(MRI.getType(Reg), MF))
13630       return true;
13631 
13632     [[fallthrough]];
13633   }
13634   case AMDGPU::G_BUILD_VECTOR:
13635     for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
13636       if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
13637         return false;
13638     return true;
13639   case AMDGPU::G_INTRINSIC:
13640   case AMDGPU::G_INTRINSIC_CONVERGENT:
13641     switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
13642     case Intrinsic::amdgcn_fmul_legacy:
13643     case Intrinsic::amdgcn_fmad_ftz:
13644     case Intrinsic::amdgcn_sqrt:
13645     case Intrinsic::amdgcn_fmed3:
13646     case Intrinsic::amdgcn_sin:
13647     case Intrinsic::amdgcn_cos:
13648     case Intrinsic::amdgcn_log:
13649     case Intrinsic::amdgcn_exp2:
13650     case Intrinsic::amdgcn_log_clamp:
13651     case Intrinsic::amdgcn_rcp:
13652     case Intrinsic::amdgcn_rcp_legacy:
13653     case Intrinsic::amdgcn_rsq:
13654     case Intrinsic::amdgcn_rsq_clamp:
13655     case Intrinsic::amdgcn_rsq_legacy:
13656     case Intrinsic::amdgcn_div_scale:
13657     case Intrinsic::amdgcn_div_fmas:
13658     case Intrinsic::amdgcn_div_fixup:
13659     case Intrinsic::amdgcn_fract:
13660     case Intrinsic::amdgcn_cvt_pkrtz:
13661     case Intrinsic::amdgcn_cubeid:
13662     case Intrinsic::amdgcn_cubema:
13663     case Intrinsic::amdgcn_cubesc:
13664     case Intrinsic::amdgcn_cubetc:
13665     case Intrinsic::amdgcn_frexp_mant:
13666     case Intrinsic::amdgcn_fdot2:
13667     case Intrinsic::amdgcn_trig_preop:
13668     case Intrinsic::amdgcn_tanh:
13669       return true;
13670     default:
13671       break;
13672     }
13673 
13674     [[fallthrough]];
13675   default:
13676     return false;
13677   }
13678 
13679   llvm_unreachable("invalid operation");
13680 }
13681 
13682 // Constant fold canonicalize.
13683 SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
13684                                                  const SDLoc &SL, EVT VT,
13685                                                  const APFloat &C) const {
13686   // Flush denormals to 0 if not enabled.
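  // For example (illustrative), with f32 denormals set to preserve-sign, the
  // denormal bit pattern 0x00400000 folds to +0.0, while in IEEE mode the
  // value is returned unchanged below.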
13687   if (C.isDenormal()) {
13688     DenormalMode Mode =
13689         DAG.getMachineFunction().getDenormalMode(C.getSemantics());
13690     if (Mode == DenormalMode::getPreserveSign()) {
13691       return DAG.getConstantFP(
13692           APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
13693     }
13694 
13695     if (Mode != DenormalMode::getIEEE())
13696       return SDValue();
13697   }
13698 
13699   if (C.isNaN()) {
13700     APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
13701     if (C.isSignaling()) {
13702       // Quiet a signaling NaN.
13703       // FIXME: Is this supposed to preserve payload bits?
13704       return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13705     }
13706 
13707     // Make sure it is the canonical NaN bitpattern.
13708     //
13709     // TODO: Can we use -1 as the canonical NaN value since it's an inline
13710     // immediate?
13711     if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
13712       return DAG.getConstantFP(CanonicalQNaN, SL, VT);
13713   }
13714 
13715   // Already canonical.
13716   return DAG.getConstantFP(C, SL, VT);
13717 }
13718 
13719 static bool vectorEltWillFoldAway(SDValue Op) {
13720   return Op.isUndef() || isa<ConstantFPSDNode>(Op);
13721 }
13722 
13723 SDValue
13724 SITargetLowering::performFCanonicalizeCombine(SDNode *N,
13725                                               DAGCombinerInfo &DCI) const {
13726   SelectionDAG &DAG = DCI.DAG;
13727   SDValue N0 = N->getOperand(0);
13728   EVT VT = N->getValueType(0);
13729 
13730   // fcanonicalize undef -> qnan
13731   if (N0.isUndef()) {
13732     APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
13733     return DAG.getConstantFP(QNaN, SDLoc(N), VT);
13734   }
13735 
13736   if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
13737     EVT VT = N->getValueType(0);
13738     return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
13739   }
13740 
13741   // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
13742   //                                                   (fcanonicalize k)
13743   //
13744   // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
13745 
13746   // TODO: This could be better with wider vectors that will be split to v2f16,
13747   // and to consider uses since there aren't that many packed operations.
13748   if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
13749       isTypeLegal(MVT::v2f16)) {
13750     SDLoc SL(N);
13751     SDValue NewElts[2];
13752     SDValue Lo = N0.getOperand(0);
13753     SDValue Hi = N0.getOperand(1);
13754     EVT EltVT = Lo.getValueType();
13755 
13756     if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
13757       for (unsigned I = 0; I != 2; ++I) {
13758         SDValue Op = N0.getOperand(I);
13759         if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13760           NewElts[I] =
13761               getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
13762         } else if (Op.isUndef()) {
13763           // Handled below based on what the other operand is.
13764           NewElts[I] = Op;
13765         } else {
13766           NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
13767         }
13768       }
13769 
13770       // If one half is undef, and one is constant, prefer a splat vector rather
13771       // than the normal qNaN. If it's a register, prefer 0.0 since that's
13772       // cheaper to use and may be free with a packed operation.
13773       if (NewElts[0].isUndef()) {
13774         NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
13775                          ? NewElts[1]
13776                          : DAG.getConstantFP(0.0f, SL, EltVT);
13777       }
13779 
13780       if (NewElts[1].isUndef()) {
13781         NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
13782                          ? NewElts[0]
13783                          : DAG.getConstantFP(0.0f, SL, EltVT);
13784       }
13785 
13786       return DAG.getBuildVector(VT, SL, NewElts);
13787     }
13788   }
13789 
13790   return SDValue();
13791 }
13792 
13793 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13794   switch (Opc) {
13795   case ISD::FMAXNUM:
13796   case ISD::FMAXNUM_IEEE:
13797   case ISD::FMAXIMUMNUM:
13798     return AMDGPUISD::FMAX3;
13799   case ISD::FMAXIMUM:
13800     return AMDGPUISD::FMAXIMUM3;
13801   case ISD::SMAX:
13802     return AMDGPUISD::SMAX3;
13803   case ISD::UMAX:
13804     return AMDGPUISD::UMAX3;
13805   case ISD::FMINNUM:
13806   case ISD::FMINNUM_IEEE:
13807   case ISD::FMINIMUMNUM:
13808     return AMDGPUISD::FMIN3;
13809   case ISD::FMINIMUM:
13810     return AMDGPUISD::FMINIMUM3;
13811   case ISD::SMIN:
13812     return AMDGPUISD::SMIN3;
13813   case ISD::UMIN:
13814     return AMDGPUISD::UMIN3;
13815   default:
13816     llvm_unreachable("Not a min/max opcode");
13817   }
13818 }
13819 
13820 SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13821                                                    const SDLoc &SL, SDValue Src,
13822                                                    SDValue MinVal,
13823                                                    SDValue MaxVal,
13824                                                    bool Signed) const {
13825 
13826   // med3 comes from
13827   //    min(max(x, K0), K1), K0 < K1
13828   //    max(min(x, K0), K1), K1 < K0
13829   //
13830   // "MinVal" and "MaxVal" respectively refer to the rhs of the
13831   // min/max op.
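  // For example, smin(smax(x, 0), 255) clamps x to the byte range: MaxVal is
  // 0, MinVal is 255, and the fold below emits smed3(x, 0, 255).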
13832   ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13833   ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13834 
13835   if (!MinK || !MaxK)
13836     return SDValue();
13837 
13838   if (Signed) {
13839     if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13840       return SDValue();
13841   } else {
13842     if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13843       return SDValue();
13844   }
13845 
13846   EVT VT = MinK->getValueType(0);
13847   unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13848   if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13849     return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13850 
13851   // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13852   // not available, but this is unlikely to be profitable as constants
13853   // will often need to be materialized & extended, especially on
13854   // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13855   return SDValue();
13856 }
13857 
13858 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13859   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13860     return C;
13861 
13862   if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13863     if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13864       return C;
13865   }
13866 
13867   return nullptr;
13868 }
13869 
13870 SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13871                                                   const SDLoc &SL, SDValue Op0,
13872                                                   SDValue Op1) const {
13873   ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13874   if (!K1)
13875     return SDValue();
13876 
13877   ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
13878   if (!K0)
13879     return SDValue();
13880 
13881   // Ordered >= (although NaN inputs should have folded away by now).
13882   if (K0->getValueAPF() > K1->getValueAPF())
13883     return SDValue();
13884 
13885   // med3 with a nan input acts like
13886   // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
13887   //
13888   // So the result depends on whether the IEEE mode bit is enabled or not with a
13889   // signaling nan input.
13890   // ieee=1
13891   // s0 snan: yields s2
13892   // s1 snan: yields s2
13893   // s2 snan: qnan
13894 
13895   // s0 qnan: min(s1, s2)
13896   // s1 qnan: min(s0, s2)
13897   // s2 qnan: min(s0, s1)
13898 
13899   // ieee=0
13900   // s0 snan: min(s1, s2)
13901   // s1 snan: min(s0, s2)
13902   // s2 snan: qnan
13903 
13904   // s0 qnan: min(s1, s2)
13905   // s1 qnan: min(s0, s2)
13906   // s2 qnan: min(s0, s1)
13907   const MachineFunction &MF = DAG.getMachineFunction();
13908   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13909 
13910   // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
13911   // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
13912   // can only form it from fmaxnum_ieee if IEEE=1.
13913   EVT VT = Op0.getValueType();
13914   if (Info->getMode().DX10Clamp) {
13915     // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13916     // hardware fmed3 behavior converting to a min.
13917     // FIXME: Should this be allowing -0.0?
13918     if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13919       return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13920   }
13921 
13922   // med3 for f16 is only available on gfx9+, and not available for v2f16.
13923   if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13924     // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13925     // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13926     // then give the other result, which is different from med3 with a NaN
13927     // input.
13928     SDValue Var = Op0.getOperand(0);
13929     if (!DAG.isKnownNeverSNaN(Var))
13930       return SDValue();
13931 
13932     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13933 
13934     if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13935         (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13936       return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
13937                          SDValue(K0, 0), SDValue(K1, 0));
13938     }
13939   }
13940 
13941   return SDValue();
13942 }
13943 
13944 /// \return true if the subtarget supports minimum3 and maximum3 with the given
13945 /// base min/max opcode \p Opc for type \p VT.
13946 static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13947                              EVT VT) {
13948   switch (Opc) {
13949   case ISD::FMINNUM:
13950   case ISD::FMAXNUM:
13951   case ISD::FMINNUM_IEEE:
13952   case ISD::FMAXNUM_IEEE:
13953   case ISD::FMINIMUMNUM:
13954   case ISD::FMAXIMUMNUM:
13955   case AMDGPUISD::FMIN_LEGACY:
13956   case AMDGPUISD::FMAX_LEGACY:
13957     return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13958   case ISD::FMINIMUM:
13959   case ISD::FMAXIMUM:
13960     return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
13961            (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
13962            (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
13963   case ISD::SMAX:
13964   case ISD::SMIN:
13965   case ISD::UMAX:
13966   case ISD::UMIN:
13967     return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13968   default:
13969     return false;
13970   }
13971 
13972   llvm_unreachable("not a min/max opcode");
13973 }
13974 
13975 SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13976                                                DAGCombinerInfo &DCI) const {
13977   SelectionDAG &DAG = DCI.DAG;
13978 
13979   EVT VT = N->getValueType(0);
13980   unsigned Opc = N->getOpcode();
13981   SDValue Op0 = N->getOperand(0);
13982   SDValue Op1 = N->getOperand(1);
13983 
13984   // Only do this if the inner op has one use since this will just increase
13985   // register pressure for no benefit.
13986 
13987   if (supportsMin3Max3(*Subtarget, Opc, VT)) {
13988     // max(max(a, b), c) -> max3(a, b, c)
13989     // min(min(a, b), c) -> min3(a, b, c)
13990     if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13991       SDLoc DL(N);
13992       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
13993                          Op0.getOperand(0), Op0.getOperand(1), Op1);
13994     }
13995 
13996     // Try commuted.
13997     // max(a, max(b, c)) -> max3(a, b, c)
13998     // min(a, min(b, c)) -> min3(a, b, c)
13999     if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
14000       SDLoc DL(N);
14001       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14002                          Op0, Op1.getOperand(0), Op1.getOperand(1));
14003     }
14004   }
14005 
14006   // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
14007   // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
14008   if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
14009     if (SDValue Med3 = performIntMed3ImmCombine(
14010             DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
14011       return Med3;
14012   }
14013   if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
14014     if (SDValue Med3 = performIntMed3ImmCombine(
14015             DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
14016       return Med3;
14017   }
14018 
14019   if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
14020     if (SDValue Med3 = performIntMed3ImmCombine(
14021             DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
14022       return Med3;
14023   }
14024   if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
14025     if (SDValue Med3 = performIntMed3ImmCombine(
14026             DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
14027       return Med3;
14028   }
14029 
14030   // if !is_snan(x):
14031   //   fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14032   //   fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14033   //   fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14034   //   fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14035   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14036        (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14037        (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14038        (Opc == AMDGPUISD::FMIN_LEGACY &&
14039         Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
14040       (VT == MVT::f32 || VT == MVT::f64 ||
14041        (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14042        (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14043       Op0.hasOneUse()) {
14044     if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14045       return Res;
14046   }
14047 
14048   // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
14049   // for some types, but at a higher cost since it's implemented with a 3
14050   // operand form.
14051   const SDNodeFlags Flags = N->getFlags();
14052   if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14053       !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
14054     unsigned NewOpc =
14055         Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14056     return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
14057   }
14058 
14059   return SDValue();
14060 }
14061 
14062 static bool isClampZeroToOne(SDValue A, SDValue B) {
14063   if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
14064     if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
14065       // FIXME: Should this be allowing -0.0?
14066       return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
14067              (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
14068     }
14069   }
14070 
14071   return false;
14072 }
14073 
14074 // FIXME: Should only worry about snans for version with chain.
14075 SDValue SITargetLowering::performFMed3Combine(SDNode *N,
14076                                               DAGCombinerInfo &DCI) const {
14077   EVT VT = N->getValueType(0);
14078   // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
14079   // NaNs. With a NaN input, the order of the operands may change the result.
14080 
14081   SelectionDAG &DAG = DCI.DAG;
14082   SDLoc SL(N);
14083 
14084   SDValue Src0 = N->getOperand(0);
14085   SDValue Src1 = N->getOperand(1);
14086   SDValue Src2 = N->getOperand(2);
14087 
14088   if (isClampZeroToOne(Src0, Src1)) {
14089     // const_a, const_b, x -> clamp is safe in all cases including signaling
14090     // nans.
14091     // FIXME: Should this be allowing -0.0?
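    // E.g. fmed3(0.0, 1.0, x) and fmed3(1.0, 0.0, x) both become clamp(x).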
14092     return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
14093   }
14094 
14095   const MachineFunction &MF = DAG.getMachineFunction();
14096   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14097 
14098   // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
14099   // handling no dx10-clamp?
14100   if (Info->getMode().DX10Clamp) {
14101     // If NaNs are clamped to 0, we are free to reorder the inputs.
14102 
14103     if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
14104       std::swap(Src0, Src1);
14105 
14106     if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
14107       std::swap(Src1, Src2);
14108 
14109     if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
14110       std::swap(Src0, Src1);
14111 
14112     if (isClampZeroToOne(Src1, Src2))
14113       return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
14114   }
14115 
14116   return SDValue();
14117 }
14118 
14119 SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
14120                                                  DAGCombinerInfo &DCI) const {
14121   SDValue Src0 = N->getOperand(0);
14122   SDValue Src1 = N->getOperand(1);
14123   if (Src0.isUndef() && Src1.isUndef())
14124     return DCI.DAG.getUNDEF(N->getValueType(0));
14125   return SDValue();
14126 }
14127 
14128 // Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
14129 // expanded into a set of cmp/select instructions.
14130 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
14131                                                 unsigned NumElem,
14132                                                 bool IsDivergentIdx,
14133                                                 const GCNSubtarget *Subtarget) {
14134   if (UseDivergentRegisterIndexing)
14135     return false;
14136 
14137   unsigned VecSize = EltSize * NumElem;
14138 
14139   // Sub-dword vectors of 2 dwords or less have a better implementation.
14140   if (VecSize <= 64 && EltSize < 32)
14141     return false;
14142 
14143   // Always expand the remaining sub-dword cases, otherwise they will be
14144   // lowered via memory.
14145   if (EltSize < 32)
14146     return true;
14147 
14148   // Always do this if var-idx is divergent, otherwise it will become a loop.
14149   if (IsDivergentIdx)
14150     return true;
14151 
14152   // Large vectors would yield too many compares and v_cndmask_b32 instructions.
14153   unsigned NumInsts = NumElem /* Number of compares */ +
14154                       ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
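  // For example, a dynamic extract from v4f64 costs NumInsts = 4 + 2 * 4 = 12
  // instructions, which is under both thresholds below, so it is expanded.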
14155 
14156   // On some architectures (GFX9) movrel is not available and it's better
14157   // to expand.
14158   if (Subtarget->useVGPRIndexMode())
14159     return NumInsts <= 16;
14160 
14161   // If movrel is available, use it instead of expanding for vector of 8
14162   // elements.
14163   if (Subtarget->hasMovrel())
14164     return NumInsts <= 15;
14165 
14166   return true;
14167 }
14168 
14169 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
14170   SDValue Idx = N->getOperand(N->getNumOperands() - 1);
14171   if (isa<ConstantSDNode>(Idx))
14172     return false;
14173 
14174   SDValue Vec = N->getOperand(0);
14175   EVT VecVT = Vec.getValueType();
14176   EVT EltVT = VecVT.getVectorElementType();
14177   unsigned EltSize = EltVT.getSizeInBits();
14178   unsigned NumElem = VecVT.getVectorNumElements();
14179 
14180   return SITargetLowering::shouldExpandVectorDynExt(
14181       EltSize, NumElem, Idx->isDivergent(), getSubtarget());
14182 }
14183 
14184 SDValue
14185 SITargetLowering::performExtractVectorEltCombine(SDNode *N,
14186                                                  DAGCombinerInfo &DCI) const {
14187   SDValue Vec = N->getOperand(0);
14188   SelectionDAG &DAG = DCI.DAG;
14189 
14190   EVT VecVT = Vec.getValueType();
14191   EVT VecEltVT = VecVT.getVectorElementType();
14192   EVT ResVT = N->getValueType(0);
14193 
14194   unsigned VecSize = VecVT.getSizeInBits();
14195   unsigned VecEltSize = VecEltVT.getSizeInBits();
14196 
14197   if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
14198       allUsesHaveSourceMods(N)) {
14199     SDLoc SL(N);
14200     SDValue Idx = N->getOperand(1);
14201     SDValue Elt =
14202         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
14203     return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
14204   }
14205 
14206   // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
14207   //    =>
14208   // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
14209   // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
14210   // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
14211   if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
14212     SDLoc SL(N);
14213     SDValue Idx = N->getOperand(1);
14214     unsigned Opc = Vec.getOpcode();
14215 
14216     switch (Opc) {
14217     default:
14218       break;
14219       // TODO: Support other binary operations.
14220     case ISD::FADD:
14221     case ISD::FSUB:
14222     case ISD::FMUL:
14223     case ISD::ADD:
14224     case ISD::UMIN:
14225     case ISD::UMAX:
14226     case ISD::SMIN:
14227     case ISD::SMAX:
14228     case ISD::FMAXNUM:
14229     case ISD::FMINNUM:
14230     case ISD::FMAXNUM_IEEE:
14231     case ISD::FMINNUM_IEEE:
14232     case ISD::FMAXIMUM:
14233     case ISD::FMINIMUM: {
14234       SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
14235                                  Vec.getOperand(0), Idx);
14236       SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
14237                                  Vec.getOperand(1), Idx);
14238 
14239       DCI.AddToWorklist(Elt0.getNode());
14240       DCI.AddToWorklist(Elt1.getNode());
14241       return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
14242     }
14243     }
14244   }
14245 
14246   // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
14247   if (shouldExpandVectorDynExt(N)) {
14248     SDLoc SL(N);
14249     SDValue Idx = N->getOperand(1);
14250     SDValue V;
14251     for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14252       SDValue IC = DAG.getVectorIdxConstant(I, SL);
14253       SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
14254       if (I == 0)
14255         V = Elt;
14256       else
14257         V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
14258     }
14259     return V;
14260   }
14261 
14262   if (!DCI.isBeforeLegalize())
14263     return SDValue();
14264 
14265   // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
14266   // elements. This exposes more load reduction opportunities by replacing
14267   // multiple small extract_vector_elements with a single 32-bit extract.
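  // For example (assuming the equivalent memory type of a loaded v8i8 is
  // v2i32): extracting element 3 gives BitIndex = 24, so we extract i32
  // element 0 of the bitcast, shift right by 24, and truncate to i8.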
14268   auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
14269   if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
14270       VecSize > 32 && VecSize % 32 == 0 && Idx) {
14271     EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
14272 
14273     unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
14274     unsigned EltIdx = BitIndex / 32;
14275     unsigned LeftoverBitIdx = BitIndex % 32;
14276     SDLoc SL(N);
14277 
14278     SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
14279     DCI.AddToWorklist(Cast.getNode());
14280 
14281     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
14282                               DAG.getConstant(EltIdx, SL, MVT::i32));
14283     DCI.AddToWorklist(Elt.getNode());
14284     SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
14285                               DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
14286     DCI.AddToWorklist(Srl.getNode());
14287 
14288     EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
14289     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
14290     DCI.AddToWorklist(Trunc.getNode());
14291 
14292     if (VecEltVT == ResVT) {
14293       return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
14294     }
14295 
14296     assert(ResVT.isScalarInteger());
14297     return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
14298   }
14299 
14300   return SDValue();
14301 }
14302 
14303 SDValue
14304 SITargetLowering::performInsertVectorEltCombine(SDNode *N,
14305                                                 DAGCombinerInfo &DCI) const {
14306   SDValue Vec = N->getOperand(0);
14307   SDValue Idx = N->getOperand(2);
14308   EVT VecVT = Vec.getValueType();
14309   EVT EltVT = VecVT.getVectorElementType();
14310 
14311   // INSERT_VECTOR_ELT (<n x e>, var-idx)
14312   // => BUILD_VECTOR n x select (e, const-idx)
14313   if (!shouldExpandVectorDynExt(N))
14314     return SDValue();
14315 
14316   SelectionDAG &DAG = DCI.DAG;
14317   SDLoc SL(N);
14318   SDValue Ins = N->getOperand(1);
14319   EVT IdxVT = Idx.getValueType();
14320 
14321   SmallVector<SDValue, 16> Ops;
14322   for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14323     SDValue IC = DAG.getConstant(I, SL, IdxVT);
14324     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
14325     SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
14326     Ops.push_back(V);
14327   }
14328 
14329   return DAG.getBuildVector(VecVT, SL, Ops);
14330 }
14331 
14332 /// Return the source of an fp_extend from f16 to f32, or a converted FP
14333 /// constant.
14334 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
14335   if (Src.getOpcode() == ISD::FP_EXTEND &&
14336       Src.getOperand(0).getValueType() == MVT::f16) {
14337     return Src.getOperand(0);
14338   }
14339 
14340   if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
14341     APFloat Val = CFP->getValueAPF();
14342     bool LosesInfo = true;
14343     Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
14344     if (!LosesInfo)
14345       return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
14346   }
14347 
14348   return SDValue();
14349 }
14350 
14351 SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
14352                                                 DAGCombinerInfo &DCI) const {
14353   assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
14354          "combine only useful on gfx8");
14355 
14356   SDValue TruncSrc = N->getOperand(0);
14357   EVT VT = N->getValueType(0);
14358   if (VT != MVT::f16)
14359     return SDValue();
14360 
14361   if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
14362       TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
14363     return SDValue();
14364 
14365   SelectionDAG &DAG = DCI.DAG;
14366   SDLoc SL(N);
14367 
14368   // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
14369   // and expanding it with min/max saves 1 instruction vs. casting to f32 and
14370   // casting back.
14371 
14372   // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
14373   // fmin(fmax(a, b), fmax(fmin(a, b), c))
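  // Sanity check with (a, b, c) = (1.0, 3.0, 2.0):
  //   fmin(fmax(1, 3), fmax(fmin(1, 3), 2)) = fmin(3, fmax(1, 2)) = 2.0,
  // which is indeed the median.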
14374   SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
14375   if (!A)
14376     return SDValue();
14377 
14378   SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
14379   if (!B)
14380     return SDValue();
14381 
14382   SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
14383   if (!C)
14384     return SDValue();
14385 
14386   // This changes signaling nan behavior. If an input is a signaling nan, it
14387   // would have been quieted by the fpext originally. We don't care because
14388   // these are unconstrained ops. If we needed to insert quieting canonicalizes
14389   // we would be worse off than just doing the promotion.
14390   SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
14391   SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
14392   SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
14393   return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
14394 }
14395 
14396 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
14397                                           const SDNode *N0,
14398                                           const SDNode *N1) const {
14399   EVT VT = N0->getValueType(0);
14400 
14401   // Only do this if we are not trying to support denormals. v_mad_f32 does not
14402   // support denormals ever.
14403   if (((VT == MVT::f32 &&
14404         denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
14405        (VT == MVT::f16 && Subtarget->hasMadF16() &&
14406         denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
14407       isOperationLegal(ISD::FMAD, VT))
14408     return ISD::FMAD;
14409 
14410   const TargetOptions &Options = DAG.getTarget().Options;
14411   if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14412        (N0->getFlags().hasAllowContract() &&
14413         N1->getFlags().hasAllowContract())) &&
14414       isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
14415     return ISD::FMA;
14416   }
14417 
14418   return 0;
14419 }
14420 
14421 // For a reassociable opcode, perform:
14422 // op x, (op y, z) -> op (op x, z), y, if x and z are uniform
14423 SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
14424                                                SelectionDAG &DAG) const {
14425   EVT VT = N->getValueType(0);
14426   if (VT != MVT::i32 && VT != MVT::i64)
14427     return SDValue();
14428 
14429   if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
14430     return SDValue();
14431 
14432   unsigned Opc = N->getOpcode();
14433   SDValue Op0 = N->getOperand(0);
14434   SDValue Op1 = N->getOperand(1);
14435 
14436   if (!(Op0->isDivergent() ^ Op1->isDivergent()))
14437     return SDValue();
14438 
14439   if (Op0->isDivergent())
14440     std::swap(Op0, Op1);
14441 
14442   if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
14443     return SDValue();
14444 
14445   SDValue Op2 = Op1.getOperand(1);
14446   Op1 = Op1.getOperand(0);
14447   if (!(Op1->isDivergent() ^ Op2->isDivergent()))
14448     return SDValue();
14449 
14450   if (Op1->isDivergent())
14451     std::swap(Op1, Op2);
14452 
14453   SDLoc SL(N);
14454   SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
14455   return DAG.getNode(Opc, SL, VT, Add1, Op2);
14456 }
14457 
14458 static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
14459                            SDValue N0, SDValue N1, SDValue N2, bool Signed) {
14460   unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
14461   SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
14462   SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
14463   return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
14464 }
14465 
14466 // Fold
14467 //     y = lshr i64 x, 32
14468 //     res = add (mul i64 y, Const), x   where "Const" is a 64-bit constant
14469 //     with Const.hi == -1
14470 // To
14471 //     res = mad_u64_u32 y.lo, Const.lo, x.lo
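// Why this works (a quick derivation, mod 2^64): Const = 2^64 - 2^32 +
// Const.lo, so
//     y * Const + x = y * Const.lo - (y << 32) + x
// and since y = x >> 32, (y << 32) equals x - zext(x.lo), leaving
//     y * Const + x = y * Const.lo + zext(x.lo).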
14472 static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
14473                                  SDValue MulLHS, SDValue MulRHS,
14474                                  SDValue AddRHS) {
14475   if (MulRHS.getOpcode() == ISD::SRL)
14476     std::swap(MulLHS, MulRHS);
14477 
14478   if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
14479     return SDValue();
14480 
14481   ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
14482   if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
14483       MulLHS.getOperand(0) != AddRHS)
14484     return SDValue();
14485 
14486   ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS.getNode());
14487   if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
14488     return SDValue();
14489 
14490   SDValue ConstMul =
14491       DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
14492   return getMad64_32(DAG, SL, MVT::i64,
14493                      DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
14494                      DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
14495 }
14496 
14497 // Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
14498 // multiplies, if any.
14499 //
14500 // Full 64-bit multiplies that feed into an addition are lowered here instead
14501 // of using the generic expansion. The generic expansion ends up with
14502 // a tree of ADD nodes that prevents us from using the "add" part of the
14503 // MAD instruction. The expansion produced here results in a chain of ADDs
14504 // instead of a tree.
14505 SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
14506                                             DAGCombinerInfo &DCI) const {
14507   assert(N->getOpcode() == ISD::ADD);
14508 
14509   SelectionDAG &DAG = DCI.DAG;
14510   EVT VT = N->getValueType(0);
14511   SDLoc SL(N);
14512   SDValue LHS = N->getOperand(0);
14513   SDValue RHS = N->getOperand(1);
14514 
14515   if (VT.isVector())
14516     return SDValue();
14517 
14518   // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
14519   // result in scalar registers for uniform values.
14520   if (!N->isDivergent() && Subtarget->hasSMulHi())
14521     return SDValue();
14522 
14523   unsigned NumBits = VT.getScalarSizeInBits();
14524   if (NumBits <= 32 || NumBits > 64)
14525     return SDValue();
14526 
14527   if (LHS.getOpcode() != ISD::MUL) {
14528     assert(RHS.getOpcode() == ISD::MUL);
14529     std::swap(LHS, RHS);
14530   }
14531 
14532   // Avoid the fold if it would unduly increase the number of multiplies due to
14533   // multiple uses, except on hardware with full-rate multiply-add (which is
14534   // part of full-rate 64-bit ops).
14535   if (!Subtarget->hasFullRate64Ops()) {
14536     unsigned NumUsers = 0;
14537     for (SDNode *User : LHS->users()) {
14538       // There is a use that does not feed into addition, so the multiply can't
14539       // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
14540       if (User->getOpcode() != ISD::ADD)
14541         return SDValue();
14542 
14543       // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
14544       // MUL + 3xADD + 3xADDC over 3xMAD.
14545       ++NumUsers;
14546       if (NumUsers >= 3)
14547         return SDValue();
14548     }
14549   }
14550 
14551   SDValue MulLHS = LHS.getOperand(0);
14552   SDValue MulRHS = LHS.getOperand(1);
14553   SDValue AddRHS = RHS;
14554 
14555   if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
14556     return FoldedMAD;
14557 
14558   // Always check whether operands are small unsigned values, since that
14559   // knowledge is useful in more cases. Check for small signed values only if
14560   // doing so can unlock a shorter code sequence.
14561   bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
14562   bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
14563 
14564   bool MulSignedLo = false;
14565   if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
14566     MulSignedLo =
14567         numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
14568   }
14569 
14570   // The operands and final result all have the same number of bits. If
14571   // operands need to be extended, they can be extended with garbage. The
14572   // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
14573   // truncated away in the end.
14574   if (VT != MVT::i64) {
14575     MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
14576     MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
14577     AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
14578   }
14579 
14580   // The basic code generated is conceptually straightforward. Pseudo code:
14581   //
14582   //   accum = mad_64_32 lhs.lo, rhs.lo, accum
14583   //   accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
14584   //   accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
14585   //
14586   // The second and third lines are optional, depending on whether the factors
14587   // are {sign,zero}-extended or not.
14588   //
14589   // The actual DAG is noisier than the pseudo code, but only due to
14590   // instructions that disassemble values into low and high parts, and
14591   // assemble the final result.
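  // For instance, when both factors are known to fit in 32 unsigned bits,
  // MulSignedLo stays false, the high-part block below is skipped, and the
  // whole fold degenerates to a single mad_u64_u32.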
14592   SDValue One = DAG.getConstant(1, SL, MVT::i32);
14593 
14594   auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
14595   auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
14596   SDValue Accum =
14597       getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
14598 
14599   if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
14600     auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
14601 
14602     if (!MulLHSUnsigned32) {
14603       auto MulLHSHi =
14604           DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
14605       SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
14606       AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14607     }
14608 
14609     if (!MulRHSUnsigned32) {
14610       auto MulRHSHi =
14611           DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
14612       SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
14613       AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14614     }
14615 
14616     Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
14617     Accum = DAG.getBitcast(MVT::i64, Accum);
14618   }
14619 
14620   if (VT != MVT::i64)
14621     Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
14622   return Accum;
14623 }
14624 
14625 SDValue
14626 SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
14627                                                   DAGCombinerInfo &DCI) const {
14628   SDValue RHS = N->getOperand(1);
14629   auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
14630   if (!CRHS)
14631     return SDValue();
14632 
14633   // TODO: Worth using computeKnownBits? Maybe expensive since it's so
14634   // common.
14635   uint64_t Val = CRHS->getZExtValue();
14636   if (countr_zero(Val) >= 32) {
14637     SelectionDAG &DAG = DCI.DAG;
14638     SDLoc SL(N);
14639     SDValue LHS = N->getOperand(0);
14640 
14641     // Avoid carry machinery if we know the low half of the add does not
14642     // contribute to the final result.
14643     //
14644     // add i64:x, K if computeTrailingZeros(K) >= 32
14645     //  => build_pair (add x.hi, K.hi), x.lo
14646 
14647     // Breaking the 64-bit add here with this strange constant is unlikely
14648     // to interfere with addressing mode patterns.
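    // For example (hypothetical constant): add i64 %x, 0x500000000 becomes
    // build_pair (trunc %x), (add %x.hi, 5), with no carry chain.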
14649 
14650     SDValue Hi = getHiHalf64(LHS, DAG);
14651     SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
14652     SDValue AddHi =
14653         DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
14654 
14655     SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
14656     return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
14657   }
14658 
14659   return SDValue();
14660 }
14661 
14662 // Collect the ultimate src of each of the mul node's operands, and confirm
14663 // each operand is 8 bits wide.
14664 static std::optional<ByteProvider<SDValue>>
14665 handleMulOperand(const SDValue &MulOperand) {
14666   auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
14667   if (!Byte0 || Byte0->isConstantZero()) {
14668     return std::nullopt;
14669   }
14670   auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
14671   if (Byte1 && !Byte1->isConstantZero()) {
14672     return std::nullopt;
14673   }
14674   return Byte0;
14675 }
14676 
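// Merge two v_perm byte-select masks, where a selector byte of 0x0c produces
// a zero byte. Per byte position, at most one of the two masks is expected to
// select real data. For example (hypothetical masks):
//   addPermMasks(0x0c0c0100, 0x07060c0c) == 0x07060100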
14677 static unsigned addPermMasks(unsigned First, unsigned Second) {
14678   unsigned FirstCs = First & 0x0c0c0c0c;
14679   unsigned SecondCs = Second & 0x0c0c0c0c;
14680   unsigned FirstNoCs = First & ~0x0c0c0c0c;
14681   unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14682 
14683   assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14684   assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14685   assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14686   assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14687 
14688   return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
14689 }
14690 
14691 struct DotSrc {
14692   SDValue SrcOp;
14693   int64_t PermMask;
14694   int64_t DWordOffset;
14695 };
14696 
14697 static void placeSources(ByteProvider<SDValue> &Src0,
14698                          ByteProvider<SDValue> &Src1,
14699                          SmallVectorImpl<DotSrc> &Src0s,
14700                          SmallVectorImpl<DotSrc> &Src1s, int Step) {
14701 
14702   assert(Src0.Src.has_value() && Src1.Src.has_value());
14703   // Src0s and Src1s are empty; just place arbitrarily.
14704   if (Step == 0) {
14705     Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
14706                      Src0.SrcOffset / 4});
14707     Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
14708                      Src1.SrcOffset / 4});
14709     return;
14710   }
14711 
14712   for (int BPI = 0; BPI < 2; BPI++) {
14713     std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
14714     if (BPI == 1) {
14715       BPP = {Src1, Src0};
14716     }
14717     unsigned ZeroMask = 0x0c0c0c0c;
14718     unsigned FMask = 0xFF << (8 * (3 - Step));
14719 
14720     unsigned FirstMask =
14721         (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14722     unsigned SecondMask =
14723         (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14724     // Attempt to find the Src vector which contains our SDValue; if so, add
14725     // our perm mask to the existing one. If we are unable to find a match for
14726     // the first SDValue, attempt to find a match for the second.
14727     int FirstGroup = -1;
14728     for (int I = 0; I < 2; I++) {
14729       SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
14730       auto MatchesFirst = [&BPP](DotSrc &IterElt) {
14731         return IterElt.SrcOp == *BPP.first.Src &&
14732                (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14733       };
14734 
14735       auto *Match = llvm::find_if(Srcs, MatchesFirst);
14736       if (Match != Srcs.end()) {
14737         Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
14738         FirstGroup = I;
14739         break;
14740       }
14741     }
14742     if (FirstGroup != -1) {
14743       SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
14744       auto MatchesSecond = [&BPP](DotSrc &IterElt) {
14745         return IterElt.SrcOp == *BPP.second.Src &&
14746                (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14747       };
14748       auto *Match = llvm::find_if(Srcs, MatchesSecond);
14749       if (Match != Srcs.end()) {
14750         Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
14751       } else
14752         Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
14753       return;
14754     }
14755   }
14756 
14757   // If we have made it here, then we could not find a match in Src0s or Src1s
14758   // for either Src0 or Src1, so just place them arbitrarily.
14759 
14760   unsigned ZeroMask = 0x0c0c0c0c;
14761   unsigned FMask = 0xFF << (8 * (3 - Step));
14762 
14763   Src0s.push_back(
14764       {*Src0.Src,
14765        ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14766        Src0.SrcOffset / 4});
14767   Src1s.push_back(
14768       {*Src1.Src,
14769        ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14770        Src1.SrcOffset / 4});
14771 }
14772 
14773 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
14774                               SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
14775                               bool IsAny) {
14776 
14777   // If we just have one source, just permute it accordingly.
14778   if (Srcs.size() == 1) {
14779     auto *Elt = Srcs.begin();
14780     auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
14781 
14782     // v_perm with the identity mask will just produce the original value.
14783     if (Elt->PermMask == 0x3020100)
14784       return EltOp;
14785 
14786     return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14787                        DAG.getConstant(Elt->PermMask, SL, MVT::i32));
14788   }
14789 
14790   auto *FirstElt = Srcs.begin();
14791   auto *SecondElt = std::next(FirstElt);
14792 
14793   SmallVector<SDValue, 2> Perms;
14794 
14795   // If we have multiple sources in the chain, combine them via perms (using
14796   // calculated perm mask) and Ors.
14797   while (true) {
14798     auto FirstMask = FirstElt->PermMask;
14799     auto SecondMask = SecondElt->PermMask;
14800 
14801     unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14802     unsigned FirstPlusFour = FirstMask | 0x04040404;
14803     // 0x0c + 0x04 = 0x10, so ANDing with 0x0F will produce 0x00 for any
14804     // original 0x0c.
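    // OR-ing FirstCs back in then restores those zero selectors, so the net
    // effect is to remap data selectors 0-3 to 4-7 (FirstVal is passed as the
    // first v_perm operand below) while leaving 0x0c zero selectors untouched;
    // e.g. 0x01 becomes 0x05 but 0x0c stays 0x0c.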
14805     FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14806 
14807     auto PermMask = addPermMasks(FirstMask, SecondMask);
14808     auto FirstVal =
14809         getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14810     auto SecondVal =
14811         getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
14812 
14813     Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
14814                                 SecondVal,
14815                                 DAG.getConstant(PermMask, SL, MVT::i32)));
14816 
14817     FirstElt = std::next(SecondElt);
14818     if (FirstElt == Srcs.end())
14819       break;
14820 
14821     SecondElt = std::next(FirstElt);
14822     // If we only have a FirstElt, then just combine that into the cumulative
14823     // source node.
14824     if (SecondElt == Srcs.end()) {
14825       auto EltOp =
14826           getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14827 
14828       Perms.push_back(
14829           DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14830                       DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
14831       break;
14832     }
14833   }
14834 
14835   assert(Perms.size() == 1 || Perms.size() == 2);
14836   return Perms.size() == 2
14837              ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
14838              : Perms[0];
14839 }
14840 
14841 static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
14842   for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14843     EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14844     auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14845     EntryMask += ZeroMask;
14846   }
14847 }
14848 
14849 static bool isMul(const SDValue Op) {
14850   auto Opcode = Op.getOpcode();
14851 
14852   return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
14853           Opcode == AMDGPUISD::MUL_I24);
14854 }
14855 
14856 static std::optional<bool>
14857 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
14858                        ByteProvider<SDValue> &Src1, const SDValue &S0Op,
14859                        const SDValue &S1Op, const SelectionDAG &DAG) {
14860   // If both ops are i8s (pre legalize-dag), then the signedness semantics
14861   // of the dot4 are irrelevant.
14862   if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
14863     return false;
14864 
14865   auto Known0 = DAG.computeKnownBits(S0Op, 0);
14866   bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
14867   bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14868   auto Known1 = DAG.computeKnownBits(S1Op, 0);
14869   bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
14870   bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14871 
14872   assert(!(S0IsUnsigned && S0IsSigned));
14873   assert(!(S1IsUnsigned && S1IsSigned));
14874 
14875   // There are 9 possible permutations of
14876   // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
14877 
14878   // In two permutations, the sign bits are known to be the same for both Ops,
14879   // so simply return Signed / Unsigned corresponding to the MSB
14880   // so simply return Signed / Unsigned corresponding to the MSB.
14881   if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14882     return S0IsSigned;
14883 
14884   // In another two permutations, the sign bits are known to be opposite. In
14885   // this case return std::nullopt to indicate a bad match.
14886 
14887   if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14888     return std::nullopt;
14889 
14890   // In the remaining five permutations, we don't know the value of the sign
14891   // bit for at least one Op. Since we have a valid ByteProvider, we know that
14892   // the upper bits must be extension bits. Thus, the only way for the sign
14893   // bit to be unknown is if it was sign extended from an unknown value, or
14894   // if it was any-extended. In either case, it is correct to use the signed
14895   // version of the signedness semantics of dot4.
14896 
14897   // In two such permutations, we know the sign bit is set for
14898   // one op, and the other is unknown. It is okay to use the signed version
14899   // of dot4.
14900   if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14901       ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14902     return true;
14903 
14904   // In one such permutation, we don't know either of the sign bits. It is
14905   // okay to use the signed version of dot4.
14906   if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14907     return true;
14908 
14909   // In two such permutations, we know the sign bit is unset for
14910   // one op, and the other is unknown. Return std::nullopt to indicate a
14911   // bad match.
14912   if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14913       ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14914     return std::nullopt;
14915 
14916   llvm_unreachable("Fully covered condition");
14917 }
14918 
14919 SDValue SITargetLowering::performAddCombine(SDNode *N,
14920                                             DAGCombinerInfo &DCI) const {
14921   SelectionDAG &DAG = DCI.DAG;
14922   EVT VT = N->getValueType(0);
14923   SDLoc SL(N);
14924   SDValue LHS = N->getOperand(0);
14925   SDValue RHS = N->getOperand(1);
14926 
14927   if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14928     if (Subtarget->hasMad64_32()) {
14929       if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14930         return Folded;
14931     }
14932   }
14933 
14934   if (SDValue V = reassociateScalarOps(N, DAG)) {
14935     return V;
14936   }
14937 
14938   if (VT == MVT::i64) {
14939     if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14940       return Folded;
14941   }
14942 
14943   if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14944       (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14945     SDValue TempNode(N, 0);
14946     std::optional<bool> IsSigned;
14947     SmallVector<DotSrc, 4> Src0s;
14948     SmallVector<DotSrc, 4> Src1s;
14949     SmallVector<SDValue, 4> Src2s;
14950 
14951     // Match the v_dot4 tree, while collecting src nodes.
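    // A sketch of the shape being matched (chain length 3, byte factors):
    //   add (mul a0, b0), (add (mul a1, b1), (add (mul a2, b2), s))
    // Each step records the mul's factors in Src0s/Src1s and the remaining
    // addend in Src2s.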
14952     int ChainLength = 0;
14953     for (int I = 0; I < 4; I++) {
14954       auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14955       if (MulIdx == -1)
14956         break;
14957       auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14958       if (!Src0)
14959         break;
14960       auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14961       if (!Src1)
14962         break;
14963 
14964       auto IterIsSigned = checkDot4MulSignedness(
14965           TempNode->getOperand(MulIdx), *Src0, *Src1,
14966           TempNode->getOperand(MulIdx)->getOperand(0),
14967           TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14968       if (!IterIsSigned)
14969         break;
14970       if (!IsSigned)
14971         IsSigned = *IterIsSigned;
14972       if (*IterIsSigned != *IsSigned)
14973         break;
14974       placeSources(*Src0, *Src1, Src0s, Src1s, I);
14975       auto AddIdx = 1 - MulIdx;
14976       // Allow the special case where add (add (mul24, 0), mul24) became
14977       // add (mul24, mul24).
14978       if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14979         Src2s.push_back(TempNode->getOperand(AddIdx));
14980         auto Src0 =
14981             handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14982         if (!Src0)
14983           break;
14984         auto Src1 =
14985             handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14986         if (!Src1)
14987           break;
14988         auto IterIsSigned = checkDot4MulSignedness(
14989             TempNode->getOperand(AddIdx), *Src0, *Src1,
14990             TempNode->getOperand(AddIdx)->getOperand(0),
14991             TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14992         if (!IterIsSigned)
14993           break;
14994         assert(IsSigned);
14995         if (*IterIsSigned != *IsSigned)
14996           break;
14997         placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14998         Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14999         ChainLength = I + 2;
15000         break;
15001       }
15002 
15003       TempNode = TempNode->getOperand(AddIdx);
15004       Src2s.push_back(TempNode);
15005       ChainLength = I + 1;
15006       if (TempNode->getNumOperands() < 2)
15007         break;
15008       LHS = TempNode->getOperand(0);
15009       RHS = TempNode->getOperand(1);
15010     }
15011 
15012     if (ChainLength < 2)
15013       return SDValue();
15014 
15015     // Masks were constructed with the assumption that we would find a chain
15016     // of length 4. If not, then we need to zero out the unused upper bytes
15017     // (via a 0x0c perm selector) so they do not affect the dot calculation.
15018     if (ChainLength < 4) {
15019       fixMasks(Src0s, ChainLength);
15020       fixMasks(Src1s, ChainLength);
15021     }
15022 
15023     SDValue Src0, Src1;
15024 
15025     // If we are just using a single source for both, and have permuted the
15026     // bytes consistently, we can just use the sources without permuting
15027     // (commutation).
15028     bool UseOriginalSrc = false;
15029     if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
15030         Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
15031         Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
15032         Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
15033       SmallVector<unsigned, 4> SrcBytes;
15034       auto Src0Mask = Src0s.begin()->PermMask;
15035       SrcBytes.push_back(Src0Mask & 0xFF000000);
15036       bool UniqueEntries = true;
15037       for (auto I = 1; I < 4; I++) {
15038         auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
15039 
15040         if (is_contained(SrcBytes, NextByte)) {
15041           UniqueEntries = false;
15042           break;
15043         }
15044         SrcBytes.push_back(NextByte);
15045       }
15046 
15047       if (UniqueEntries) {
15048         UseOriginalSrc = true;
15049 
15050         auto *FirstElt = Src0s.begin();
15051         auto FirstEltOp =
15052             getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15053 
15054         auto *SecondElt = Src1s.begin();
15055         auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
15056                                               SecondElt->DWordOffset);
15057 
15058         Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
15059                                              MVT::getIntegerVT(32));
15060         Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
15061                                              MVT::getIntegerVT(32));
15062       }
15063     }
15064 
15065     if (!UseOriginalSrc) {
15066       Src0 = resolveSources(DAG, SL, Src0s, false, true);
15067       Src1 = resolveSources(DAG, SL, Src1s, false, true);
15068     }
15069 
15070     assert(IsSigned);
15071     SDValue Src2 =
15072         DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
15073 
15074     SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
15075                                                   : Intrinsic::amdgcn_udot4,
15076                                         SL, MVT::i64);
15077 
15078     assert(!VT.isVector());
15079     auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
15080                            Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
15081 
15082     return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
15083   }
15084 
15085   if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
15086     return SDValue();
15087 
15088   // add x, zext (setcc) => uaddo_carry x, 0, setcc
15089   // add x, sext (setcc) => usubo_carry x, 0, setcc
15090   unsigned Opc = LHS.getOpcode();
15091   if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
15092       Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
15093     std::swap(RHS, LHS);
15094 
15095   Opc = RHS.getOpcode();
15096   switch (Opc) {
15097   default:
15098     break;
15099   case ISD::ZERO_EXTEND:
15100   case ISD::SIGN_EXTEND:
15101   case ISD::ANY_EXTEND: {
15102     auto Cond = RHS.getOperand(0);
15103     // If this won't be a real VOPC output, we would still need to insert an
15104     // extra instruction anyway.
15105     if (!isBoolSGPR(Cond))
15106       break;
15107     SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
15108     SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
15109     Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
15110     return DAG.getNode(Opc, SL, VTList, Args);
15111   }
15112   case ISD::UADDO_CARRY: {
15113     // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
15114     if (!isNullConstant(RHS.getOperand(1)))
15115       break;
15116     SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
15117     return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
15118   }
15119   }
15120   return SDValue();
15121 }
15122 
15123 SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
15124                                                DAGCombinerInfo &DCI) const {
15125   SelectionDAG &DAG = DCI.DAG;
15126   SDLoc DL(N);
15127   SDValue N0 = N->getOperand(0);
15128   SDValue N1 = N->getOperand(1);
15129 
15130   if (N1.getOpcode() == ISD::ADD) {
15131     // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
15132     //    y is not, and (add y, z) is used only once.
15133     // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
15134     //    z is not, and (add y, z) is used only once.
15135     // The goal is to move constant offsets to the outermost ptradd, to create
15136     // more opportunities to fold offsets into memory instructions.
15137     // Together with the generic combines in DAGCombiner.cpp, this also
15138     // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
15139     //
15140     // This transform is here instead of in the general DAGCombiner as it can
15141     // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
15142     // AArch64's CPA.
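    // For example (hypothetical): (ptradd p, (add v, 16))
    //   -> (ptradd (ptradd p, v), 16)
    // so that the constant 16 can later be folded into a memory instruction's
    // immediate offset.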
15143     SDValue X = N0;
15144     SDValue Y = N1.getOperand(0);
15145     SDValue Z = N1.getOperand(1);
15146     if (N1.hasOneUse()) {
15147       bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
15148       bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
15149       if (ZIsConstant != YIsConstant) {
15150         // If both additions in the original were NUW, the new ones are as well.
15151         SDNodeFlags Flags =
15152             (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
15153         if (YIsConstant)
15154           std::swap(Y, Z);
15155 
15156         SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags);
15157         DCI.AddToWorklist(Inner.getNode());
15158         return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags);
15159       }
15160     }
15161   }
15162 
15163   return SDValue();
15164 }
15165 
15166 SDValue SITargetLowering::performSubCombine(SDNode *N,
15167                                             DAGCombinerInfo &DCI) const {
15168   SelectionDAG &DAG = DCI.DAG;
15169   EVT VT = N->getValueType(0);
15170 
15171   if (VT == MVT::i64) {
15172     if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15173       return Folded;
15174   }
15175 
15176   if (VT != MVT::i32)
15177     return SDValue();
15178 
15179   SDLoc SL(N);
15180   SDValue LHS = N->getOperand(0);
15181   SDValue RHS = N->getOperand(1);
15182 
15183   // sub x, zext (setcc) => usubo_carry x, 0, setcc
15184   // sub x, sext (setcc) => uaddo_carry x, 0, setcc
15185   unsigned Opc = RHS.getOpcode();
15186   switch (Opc) {
15187   default:
15188     break;
15189   case ISD::ZERO_EXTEND:
15190   case ISD::SIGN_EXTEND:
15191   case ISD::ANY_EXTEND: {
15192     auto Cond = RHS.getOperand(0);
15193     // If this won't be a real VOPC output, we would still need to insert an
15194     // extra instruction anyway.
15195     if (!isBoolSGPR(Cond))
15196       break;
15197     SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
15198     SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
15199     Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
15200     return DAG.getNode(Opc, SL, VTList, Args);
15201   }
15202   }
15203 
15204   if (LHS.getOpcode() == ISD::USUBO_CARRY) {
15205     // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
15206     if (!isNullConstant(LHS.getOperand(1)))
15207       return SDValue();
15208     SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
15209     return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
15210   }
15211   return SDValue();
15212 }
15213 
15214 SDValue
15215 SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
15216                                                  DAGCombinerInfo &DCI) const {
15217 
15218   if (N->getValueType(0) != MVT::i32)
15219     return SDValue();
15220 
15221   if (!isNullConstant(N->getOperand(1)))
15222     return SDValue();
15223 
15224   SelectionDAG &DAG = DCI.DAG;
15225   SDValue LHS = N->getOperand(0);
15226 
15227   // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
15228   // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
15229   unsigned LHSOpc = LHS.getOpcode();
15230   unsigned Opc = N->getOpcode();
15231   if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
15232       (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
15233     SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
15234     return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
15235   }
15236   return SDValue();
15237 }
15238 
15239 SDValue SITargetLowering::performFAddCombine(SDNode *N,
15240                                              DAGCombinerInfo &DCI) const {
15241   if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
15242     return SDValue();
15243 
15244   SelectionDAG &DAG = DCI.DAG;
15245   EVT VT = N->getValueType(0);
15246 
15247   SDLoc SL(N);
15248   SDValue LHS = N->getOperand(0);
15249   SDValue RHS = N->getOperand(1);
15250 
15251   // These should really be instruction patterns, but writing patterns with
15252   // source modifiers is a pain.
15253 
15254   // fadd (fadd (a, a), b) -> mad 2.0, a, b
15255   if (LHS.getOpcode() == ISD::FADD) {
15256     SDValue A = LHS.getOperand(0);
15257     if (A == LHS.getOperand(1)) {
15258       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
15259       if (FusedOp != 0) {
15260         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
15261         return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
15262       }
15263     }
15264   }
15265 
15266   // fadd (b, fadd (a, a)) -> mad 2.0, a, b
15267   if (RHS.getOpcode() == ISD::FADD) {
15268     SDValue A = RHS.getOperand(0);
15269     if (A == RHS.getOperand(1)) {
15270       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
15271       if (FusedOp != 0) {
15272         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
15273         return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
15274       }
15275     }
15276   }
15277 
15278   return SDValue();
15279 }
15280 
15281 SDValue SITargetLowering::performFSubCombine(SDNode *N,
15282                                              DAGCombinerInfo &DCI) const {
15283   if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
15284     return SDValue();
15285 
15286   SelectionDAG &DAG = DCI.DAG;
15287   SDLoc SL(N);
15288   EVT VT = N->getValueType(0);
15289   assert(!VT.isVector());
15290 
15291   // Try to get the fneg to fold into the source modifier. This undoes generic
15292   // DAG combines and folds them into the mad.
15293   //
15294   // Only do this if we are not trying to support denormals. v_mad_f32 does
15295   // not support denormals ever.
15296   SDValue LHS = N->getOperand(0);
15297   SDValue RHS = N->getOperand(1);
15298   if (LHS.getOpcode() == ISD::FADD) {
15299     // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
15300     SDValue A = LHS.getOperand(0);
15301     if (A == LHS.getOperand(1)) {
15302       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
15303       if (FusedOp != 0) {
15304         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
15305         SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
15306 
15307         return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
15308       }
15309     }
15310   }
15311 
15312   if (RHS.getOpcode() == ISD::FADD) {
15313     // (fsub c, (fadd a, a)) -> mad -2.0, a, c
15314 
15315     SDValue A = RHS.getOperand(0);
15316     if (A == RHS.getOperand(1)) {
15317       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
15318       if (FusedOp != 0) {
15319         const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
15320         return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
15321       }
15322     }
15323   }
15324 
15325   return SDValue();
15326 }
15327 
15328 SDValue SITargetLowering::performFDivCombine(SDNode *N,
15329                                              DAGCombinerInfo &DCI) const {
15330   SelectionDAG &DAG = DCI.DAG;
15331   SDLoc SL(N);
15332   EVT VT = N->getValueType(0);
15333   if (VT != MVT::f16 || !Subtarget->has16BitInsts())
15334     return SDValue();
15335 
15336   SDValue LHS = N->getOperand(0);
15337   SDValue RHS = N->getOperand(1);
15338 
15339   SDNodeFlags Flags = N->getFlags();
15340   SDNodeFlags RHSFlags = RHS->getFlags();
15341   if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
15342       !RHS->hasOneUse())
15343     return SDValue();
15344 
15345   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
15346     bool IsNegative = false;
15347     if (CLHS->isExactlyValue(1.0) ||
15348         (IsNegative = CLHS->isExactlyValue(-1.0))) {
15349       // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
15350       // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
15351       if (RHS.getOpcode() == ISD::FSQRT) {
15352         // TODO: Or in RHS flags, somehow missing from SDNodeFlags
15353         SDValue Rsq =
15354             DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
15355         return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
15356       }
15357     }
15358   }
15359 
15360   return SDValue();
15361 }
15362 
15363 SDValue SITargetLowering::performFMulCombine(SDNode *N,
15364                                              DAGCombinerInfo &DCI) const {
15365   SelectionDAG &DAG = DCI.DAG;
15366   EVT VT = N->getValueType(0);
15367   EVT ScalarVT = VT.getScalarType();
15368   EVT IntVT = VT.changeElementType(MVT::i32);
15369 
15370   if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
15371       (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
15372     // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
15373     return SDValue();
15374   }
15375 
15376   SDValue LHS = N->getOperand(0);
15377   SDValue RHS = N->getOperand(1);
15378 
15379   // It is cheaper to realize i32 inline constants than to materialize f16 or
15380   // f64 (or even non-inline f32) values, which is possible via ldexp, as
15381   // shown below:
15382   //
15383   // Given : A = 2^a  &  B = 2^b ; where a and b are integers.
15384   // fmul x, (select y, A, B)     -> ldexp( x, (select i32 y, a, b) )
15385   // fmul x, (select y, -A, -B)   -> ldexp( (fneg x), (select i32 y, a, b) )
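  // For example (hypothetical): fmul x, (select y, 8.0, 0.5)
  //   -> fldexp x, (select i32 y, 3, -1)
  // since 8.0 = 2^3 and 0.5 = 2^-1.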
15386   if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
15387       (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
15388     const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
15389     if (!TrueNode)
15390       return SDValue();
15391     const ConstantFPSDNode *FalseNode =
15392         isConstOrConstSplatFP(RHS.getOperand(2));
15393     if (!FalseNode)
15394       return SDValue();
15395 
15396     if (TrueNode->isNegative() != FalseNode->isNegative())
15397       return SDValue();
15398 
15399     // For f32, only non-inline constants should be transformed.
15400     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15401     if (ScalarVT == MVT::f32 &&
15402         TII->isInlineConstant(TrueNode->getValueAPF()) &&
15403         TII->isInlineConstant(FalseNode->getValueAPF()))
15404       return SDValue();
15405 
15406     int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
15407     if (TrueNodeExpVal == INT_MIN)
15408       return SDValue();
15409     int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
15410     if (FalseNodeExpVal == INT_MIN)
15411       return SDValue();
15412 
15413     SDLoc SL(N);
15414     SDValue SelectNode =
15415         DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
15416                     DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
15417                     DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
15418 
15419     LHS = TrueNode->isNegative()
15420               ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
15421               : LHS;
15422 
15423     return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
15424   }
15425 
15426   return SDValue();
15427 }
15428 
15429 SDValue SITargetLowering::performFMACombine(SDNode *N,
15430                                             DAGCombinerInfo &DCI) const {
15431   SelectionDAG &DAG = DCI.DAG;
15432   EVT VT = N->getValueType(0);
15433   SDLoc SL(N);
15434 
15435   if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
15436     return SDValue();
15437 
15438   // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
15439   //   FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
15440   SDValue Op1 = N->getOperand(0);
15441   SDValue Op2 = N->getOperand(1);
15442   SDValue FMA = N->getOperand(2);
15443 
15444   if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
15445       Op2.getOpcode() != ISD::FP_EXTEND)
15446     return SDValue();
15447 
15448   // fdot2_f32_f16 always flushes fp32 denormal operands and the output to
15449   // zero, regardless of the denorm mode setting. Therefore,
15450   // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
15451   const TargetOptions &Options = DAG.getTarget().Options;
15452   if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
15453       (N->getFlags().hasAllowContract() &&
15454        FMA->getFlags().hasAllowContract())) {
15455     Op1 = Op1.getOperand(0);
15456     Op2 = Op2.getOperand(0);
15457     if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15458         Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15459       return SDValue();
15460 
15461     SDValue Vec1 = Op1.getOperand(0);
15462     SDValue Idx1 = Op1.getOperand(1);
15463     SDValue Vec2 = Op2.getOperand(0);
15464 
15465     SDValue FMAOp1 = FMA.getOperand(0);
15466     SDValue FMAOp2 = FMA.getOperand(1);
15467     SDValue FMAAcc = FMA.getOperand(2);
15468 
15469     if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
15470         FMAOp2.getOpcode() != ISD::FP_EXTEND)
15471       return SDValue();
15472 
15473     FMAOp1 = FMAOp1.getOperand(0);
15474     FMAOp2 = FMAOp2.getOperand(0);
15475     if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15476         FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15477       return SDValue();
15478 
15479     SDValue Vec3 = FMAOp1.getOperand(0);
15480     SDValue Vec4 = FMAOp2.getOperand(0);
15481     SDValue Idx2 = FMAOp1.getOperand(1);
15482 
15483     if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
15484         // Idx1 and Idx2 cannot be the same.
15485         Idx1 == Idx2)
15486       return SDValue();
15487 
15488     if (Vec1 == Vec2 || Vec3 == Vec4)
15489       return SDValue();
15490 
15491     if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
15492       return SDValue();
15493 
15494     if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
15495       return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
15496                          DAG.getTargetConstant(0, SL, MVT::i1));
15497     }
15498   }
15499   return SDValue();
15500 }
15501 
15502 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
15503                                               DAGCombinerInfo &DCI) const {
15504   SelectionDAG &DAG = DCI.DAG;
15505   SDLoc SL(N);
15506 
15507   SDValue LHS = N->getOperand(0);
15508   SDValue RHS = N->getOperand(1);
15509   EVT VT = LHS.getValueType();
15510   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
15511 
15512   auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15513   if (!CRHS) {
15514     CRHS = dyn_cast<ConstantSDNode>(LHS);
15515     if (CRHS) {
15516       std::swap(LHS, RHS);
15517       CC = getSetCCSwappedOperands(CC);
15518     }
15519   }
15520 
15521   if (CRHS) {
15522     if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
15523         isBoolSGPR(LHS.getOperand(0))) {
15524       // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
15525       // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
15526       // setcc (sext from i1 cc),  0, eq|sge|ule) => not cc => xor cc, -1
15527       // setcc (sext from i1 cc),  0, ne|ugt|slt) => cc
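      // E.g. with cc = true the sext yields -1, so (setcc -1, -1, ne) is
      // false, which is exactly (xor cc, -1) in i1.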
15528       if ((CRHS->isAllOnes() &&
15529            (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
15530           (CRHS->isZero() &&
15531            (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
15532         return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
15533                            DAG.getAllOnesConstant(SL, MVT::i1));
15534       if ((CRHS->isAllOnes() &&
15535            (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
15536           (CRHS->isZero() &&
15537            (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
15538         return LHS.getOperand(0);
15539     }
15540 
15541     const APInt &CRHSVal = CRHS->getAPIntValue();
15542     if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
15543         LHS.getOpcode() == ISD::SELECT &&
15544         isa<ConstantSDNode>(LHS.getOperand(1)) &&
15545         isa<ConstantSDNode>(LHS.getOperand(2)) &&
15546         LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
15547         isBoolSGPR(LHS.getOperand(0))) {
15548       // Given CT != FT:
15549       // setcc (select cc, CT, CF), CF, eq => xor cc, -1
15550       // setcc (select cc, CT, CF), CF, ne => cc
15551       // setcc (select cc, CT, CF), CT, ne => xor cc, -1
15552       // setcc (select cc, CT, CF), CT, eq => cc
15553       const APInt &CT = LHS.getConstantOperandAPInt(1);
15554       const APInt &CF = LHS.getConstantOperandAPInt(2);
15555 
15556       if ((CF == CRHSVal && CC == ISD::SETEQ) ||
15557           (CT == CRHSVal && CC == ISD::SETNE))
15558         return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
15559                            DAG.getAllOnesConstant(SL, MVT::i1));
15560       if ((CF == CRHSVal && CC == ISD::SETNE) ||
15561           (CT == CRHSVal && CC == ISD::SETEQ))
15562         return LHS.getOperand(0);
15563     }
15564   }
15565 
15566   if (VT != MVT::f32 && VT != MVT::f64 &&
15567       (!Subtarget->has16BitInsts() || VT != MVT::f16))
15568     return SDValue();
15569 
15570   // Match isinf/isfinite pattern
15571   // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
15572   // (fcmp one (fabs x), inf) -> (fp_class x,
15573   // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero))
15574   if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
15575       LHS.getOpcode() == ISD::FABS) {
15576     const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
15577     if (!CRHS)
15578       return SDValue();
15579 
15580     const APFloat &APF = CRHS->getValueAPF();
15581     if (APF.isInfinity() && !APF.isNegative()) {
15582       const unsigned IsInfMask =
15583           SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
15584       const unsigned IsFiniteMask =
15585           SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
15586           SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
15587           SIInstrFlags::P_SUBNORMAL;
15588       unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
15589       return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
15590                          DAG.getConstant(Mask, SL, MVT::i32));
15591     }
15592   }
15593 
15594   return SDValue();
15595 }
15596 
15597 SDValue
15598 SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
15599                                              DAGCombinerInfo &DCI) const {
15600   SelectionDAG &DAG = DCI.DAG;
15601   SDLoc SL(N);
15602   unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
15603 
15604   SDValue Src = N->getOperand(0);
15605   SDValue Shift = N->getOperand(0);
15606 
15607   // TODO: Extend type shouldn't matter (assuming legal types).
15608   if (Shift.getOpcode() == ISD::ZERO_EXTEND)
15609     Shift = Shift.getOperand(0);
15610 
15611   if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
15612     // cvt_f32_ubyte1 (shl x,  8) -> cvt_f32_ubyte0 x
15613     // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
15614     // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
15615     // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
15616     // cvt_f32_ubyte0 (srl x,  8) -> cvt_f32_ubyte1 x
15617     if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
15618       SDValue Shifted = DAG.getZExtOrTrunc(
15619           Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
15620 
15621       unsigned ShiftOffset = 8 * Offset;
15622       if (Shift.getOpcode() == ISD::SHL)
15623         ShiftOffset -= C->getZExtValue();
15624       else
15625         ShiftOffset += C->getZExtValue();
15626 
15627       if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
15628         return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
15629                            MVT::f32, Shifted);
15630       }
15631     }
15632   }
15633 
15634   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15635   APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
15636   if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
15637     // We simplified Src. If this node is not dead, visit it again so it is
15638     // folded properly.
15639     if (N->getOpcode() != ISD::DELETED_NODE)
15640       DCI.AddToWorklist(N);
15641     return SDValue(N, 0);
15642   }
15643 
15644   // Handle (or x, (srl y, 8)) pattern when known bits are zero.
15645   if (SDValue DemandedSrc =
15646           TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
15647     return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
15648 
15649   return SDValue();
15650 }
15651 
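// Fold AMDGPUISD::CLAMP of a floating-point constant. Illustrative results,
// assuming the function runs in DX10 clamp mode:
//   clamp(-0.5) -> 0.0   (negative values clamp to zero)
//   clamp(0.25) -> 0.25  (values already in [0, 1] are unchanged)
//   clamp(2.0)  -> 1.0   (values above one clamp to one)
//   clamp(NaN)  -> 0.0   (only when DX10Clamp is set; otherwise NaN is kept)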
15652 SDValue SITargetLowering::performClampCombine(SDNode *N,
15653                                               DAGCombinerInfo &DCI) const {
15654   ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
15655   if (!CSrc)
15656     return SDValue();
15657 
15658   const MachineFunction &MF = DCI.DAG.getMachineFunction();
15659   const APFloat &F = CSrc->getValueAPF();
15660   APFloat Zero = APFloat::getZero(F.getSemantics());
15661   if (F < Zero ||
15662       (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
15663     return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
15664   }
15665 
15666   APFloat One(F.getSemantics(), "1.0");
15667   if (F > One)
15668     return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
15669 
15670   return SDValue(CSrc, 0);
15671 }
15672 
15673 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
15674                                             DAGCombinerInfo &DCI) const {
15675   switch (N->getOpcode()) {
15676   case ISD::ADD:
15677   case ISD::SUB:
15678   case ISD::SHL:
15679   case ISD::SRL:
15680   case ISD::SRA:
15681   case ISD::AND:
15682   case ISD::OR:
15683   case ISD::XOR:
15684   case ISD::MUL:
15685   case ISD::SETCC:
15686   case ISD::SELECT:
15687   case ISD::SMIN:
15688   case ISD::SMAX:
15689   case ISD::UMIN:
15690   case ISD::UMAX:
15691     if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
15692       return Res;
15693     break;
15694   default:
15695     break;
15696   }
15697 
15698   if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
15699     return SDValue();
15700 
15701   switch (N->getOpcode()) {
15702   case ISD::ADD:
15703     return performAddCombine(N, DCI);
15704   case ISD::PTRADD:
15705     return performPtrAddCombine(N, DCI);
15706   case ISD::SUB:
15707     return performSubCombine(N, DCI);
15708   case ISD::UADDO_CARRY:
15709   case ISD::USUBO_CARRY:
15710     return performAddCarrySubCarryCombine(N, DCI);
15711   case ISD::FADD:
15712     return performFAddCombine(N, DCI);
15713   case ISD::FSUB:
15714     return performFSubCombine(N, DCI);
15715   case ISD::FDIV:
15716     return performFDivCombine(N, DCI);
15717   case ISD::FMUL:
15718     return performFMulCombine(N, DCI);
15719   case ISD::SETCC:
15720     return performSetCCCombine(N, DCI);
15721   case ISD::FMAXNUM:
15722   case ISD::FMINNUM:
15723   case ISD::FMAXNUM_IEEE:
15724   case ISD::FMINNUM_IEEE:
15725   case ISD::FMAXIMUM:
15726   case ISD::FMINIMUM:
15727   case ISD::FMAXIMUMNUM:
15728   case ISD::FMINIMUMNUM:
15729   case ISD::SMAX:
15730   case ISD::SMIN:
15731   case ISD::UMAX:
15732   case ISD::UMIN:
15733   case AMDGPUISD::FMIN_LEGACY:
15734   case AMDGPUISD::FMAX_LEGACY:
15735     return performMinMaxCombine(N, DCI);
15736   case ISD::FMA:
15737     return performFMACombine(N, DCI);
15738   case ISD::AND:
15739     return performAndCombine(N, DCI);
15740   case ISD::OR:
15741     return performOrCombine(N, DCI);
15742   case ISD::FSHR: {
15743     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15744     if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
15745         TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15746       return matchPERM(N, DCI);
15747     }
15748     break;
15749   }
15750   case ISD::XOR:
15751     return performXorCombine(N, DCI);
15752   case ISD::ZERO_EXTEND:
15753     return performZeroExtendCombine(N, DCI);
15754   case ISD::SIGN_EXTEND_INREG:
15755     return performSignExtendInRegCombine(N, DCI);
15756   case AMDGPUISD::FP_CLASS:
15757     return performClassCombine(N, DCI);
15758   case ISD::FCANONICALIZE:
15759     return performFCanonicalizeCombine(N, DCI);
15760   case AMDGPUISD::RCP:
15761     return performRcpCombine(N, DCI);
15762   case ISD::FLDEXP:
15763   case AMDGPUISD::FRACT:
15764   case AMDGPUISD::RSQ:
15765   case AMDGPUISD::RCP_LEGACY:
15766   case AMDGPUISD::RCP_IFLAG:
15767   case AMDGPUISD::RSQ_CLAMP: {
15768     // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
15769     SDValue Src = N->getOperand(0);
15770     if (Src.isUndef())
15771       return Src;
15772     break;
15773   }
15774   case ISD::SINT_TO_FP:
15775   case ISD::UINT_TO_FP:
15776     return performUCharToFloatCombine(N, DCI);
15777   case ISD::FCOPYSIGN:
15778     return performFCopySignCombine(N, DCI);
15779   case AMDGPUISD::CVT_F32_UBYTE0:
15780   case AMDGPUISD::CVT_F32_UBYTE1:
15781   case AMDGPUISD::CVT_F32_UBYTE2:
15782   case AMDGPUISD::CVT_F32_UBYTE3:
15783     return performCvtF32UByteNCombine(N, DCI);
15784   case AMDGPUISD::FMED3:
15785     return performFMed3Combine(N, DCI);
15786   case AMDGPUISD::CVT_PKRTZ_F16_F32:
15787     return performCvtPkRTZCombine(N, DCI);
15788   case AMDGPUISD::CLAMP:
15789     return performClampCombine(N, DCI);
15790   case ISD::SCALAR_TO_VECTOR: {
15791     SelectionDAG &DAG = DCI.DAG;
15792     EVT VT = N->getValueType(0);
15793 
15794     // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
15795     if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15796       SDLoc SL(N);
15797       SDValue Src = N->getOperand(0);
15798       EVT EltVT = Src.getValueType();
15799       if (EltVT != MVT::i16)
15800         Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
15801 
15802       SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
15803       return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
15804     }
15805 
15806     break;
15807   }
15808   case ISD::EXTRACT_VECTOR_ELT:
15809     return performExtractVectorEltCombine(N, DCI);
15810   case ISD::INSERT_VECTOR_ELT:
15811     return performInsertVectorEltCombine(N, DCI);
15812   case ISD::FP_ROUND:
15813     return performFPRoundCombine(N, DCI);
15814   case ISD::LOAD: {
15815     if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
15816       return Widened;
15817     [[fallthrough]];
15818   }
15819   default: {
15820     if (!DCI.isBeforeLegalize()) {
15821       if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
15822         return performMemSDNodeCombine(MemNode, DCI);
15823     }
15824 
15825     break;
15826   }
15827   }
15828 
15829   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
15830 }
15831 
15832 /// Helper function for adjustWritemask
15833 static unsigned SubIdx2Lane(unsigned Idx) {
15834   switch (Idx) {
15835   default:
15836     return ~0u;
15837   case AMDGPU::sub0:
15838     return 0;
15839   case AMDGPU::sub1:
15840     return 1;
15841   case AMDGPU::sub2:
15842     return 2;
15843   case AMDGPU::sub3:
15844     return 3;
15845   case AMDGPU::sub4:
15846     return 4; // Possible with TFE/LWE
15847   }
15848 }
15849 
15850 /// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
15851 SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
15852                                           SelectionDAG &DAG) const {
15853   unsigned Opcode = Node->getMachineOpcode();
15854 
15855   // Subtract 1 because the vdata output is not a MachineSDNode operand.
15856   int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
15857   if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
15858     return Node; // not implemented for D16
15859 
15860   SDNode *Users[5] = {nullptr};
15861   unsigned Lane = 0;
15862   unsigned DmaskIdx =
15863       AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
15864   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
15865   unsigned NewDmask = 0;
15866   unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
15867   unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
15868   bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
15869                  (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
15870   unsigned TFCLane = 0;
15871   bool HasChain = Node->getNumValues() > 1;
15872 
15873   if (OldDmask == 0) {
15874     // These are folded out, but on the off chance it happens, don't assert.
15875     return Node;
15876   }
15877 
15878   unsigned OldBitsSet = llvm::popcount(OldDmask);
15879   // Work out which is the TFE/LWE lane if that is enabled.
15880   if (UsesTFC) {
15881     TFCLane = OldBitsSet;
15882   }
15883 
15884   // Try to figure out the used register components
15885   for (SDUse &Use : Node->uses()) {
15886 
15887     // Don't look at users of the chain.
15888     if (Use.getResNo() != 0)
15889       continue;
15890 
15891     SDNode *User = Use.getUser();
15892 
15893     // Abort if we can't understand the usage
15894     if (!User->isMachineOpcode() ||
15895         User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15896       return Node;
15897 
15898     // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
15899     // Note that subregs are packed, i.e. Lane==0 is the first bit set
15900     // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
15901     // set, etc.
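    // For example (hypothetical dmask): with OldDmask = 0b1010 only the Y and
    // W components are enabled, so Lane == 0 selects Y and Lane == 1 selects W.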
15902     Lane = SubIdx2Lane(User->getConstantOperandVal(1));
15903     if (Lane == ~0u)
15904       return Node;
15905 
15906     // Check if the use is for the TFE/LWE generated result at VGPRn+1.
15907     if (UsesTFC && Lane == TFCLane) {
15908       Users[Lane] = User;
15909     } else {
15910       // Set which texture component corresponds to the lane.
15911       unsigned Comp;
15912       for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15913         Comp = llvm::countr_zero(Dmask);
15914         Dmask &= ~(1 << Comp);
15915       }
15916 
15917       // Abort if we have more than one user per component.
15918       if (Users[Lane])
15919         return Node;
15920 
15921       Users[Lane] = User;
15922       NewDmask |= 1 << Comp;
15923     }
15924   }
15925 
15926   // Don't allow 0 dmask, as hardware assumes one channel enabled.
15927   bool NoChannels = !NewDmask;
15928   if (NoChannels) {
15929     if (!UsesTFC) {
15930       // No uses of the result and not using TFC. Then do nothing.
15931       return Node;
15932     }
15933     // If the original dmask has one channel, there is nothing to do.
15934     if (OldBitsSet == 1)
15935       return Node;
15936     // Use an arbitrary dmask; one is required for the instruction to work.
15937     NewDmask = 1;
15938   }
15939   // Abort if there's no change
15940   if (NewDmask == OldDmask)
15941     return Node;
15942 
15943   unsigned BitsSet = llvm::popcount(NewDmask);
15944 
15945   // Check for TFE or LWE - increase the number of channels by one to account
15946   // for the extra return value
15947   // This will need adjustment for D16 if it is also handled by
15948   // adjustWritemask (this function), but at present D16 is excluded.
15949   unsigned NewChannels = BitsSet + UsesTFC;
15950 
15951   int NewOpcode =
15952       AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
15953   assert(NewOpcode != -1 &&
15954          NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
15955          "failed to find equivalent MIMG op");
15956 
15957   // Adjust the writemask in the node
15958   SmallVector<SDValue, 12> Ops;
15959   llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
15960   Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
15961   llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
15962 
15963   MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
15964 
15965   MVT ResultVT = NewChannels == 1
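  // The channel count is widened to a legal vector width below: e.g. a
  // hypothetical NewDmask = 0b0101 with TFE gives NewChannels = 3, which maps
  // to a 4-element vector type; likewise 5 channels (4 data + TFE) map to 8.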
15966                      ? SVT
15967                      : MVT::getVectorVT(SVT, NewChannels == 3   ? 4
15968                                              : NewChannels == 5 ? 8
15969                                                                 : NewChannels);
15970   SDVTList NewVTList =
15971       HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
15972 
15973   MachineSDNode *NewNode =
15974       DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
15975 
15976   if (HasChain) {
15977     // Update chain.
15978     DAG.setNodeMemRefs(NewNode, Node->memoperands());
15979     DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
15980   }
15981 
15982   if (NewChannels == 1) {
15983     assert(Node->hasNUsesOfValue(1, 0));
15984     SDNode *Copy =
15985         DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
15986                            Users[Lane]->getValueType(0), SDValue(NewNode, 0));
15987     DAG.ReplaceAllUsesWith(Users[Lane], Copy);
15988     return nullptr;
15989   }
15990 
15991   // Update the users of the node with the new indices
15992   for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
15993     SDNode *User = Users[i];
15994     if (!User) {
15995       // Handle the special case of NoChannels. We set NewDmask to 1 above, but
15996       // Users[0] is still nullptr because channel 0 doesn't really have a use.
15997       if (i || !NoChannels)
15998         continue;
15999     } else {
16000       SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
16001       SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
16002       if (NewUser != User) {
16003         DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
16004         DAG.RemoveDeadNode(User);
16005       }
16006     }
16007 
16008     switch (Idx) {
16009     default:
16010       break;
16011     case AMDGPU::sub0:
16012       Idx = AMDGPU::sub1;
16013       break;
16014     case AMDGPU::sub1:
16015       Idx = AMDGPU::sub2;
16016       break;
16017     case AMDGPU::sub2:
16018       Idx = AMDGPU::sub3;
16019       break;
16020     case AMDGPU::sub3:
16021       Idx = AMDGPU::sub4;
16022       break;
16023     }
16024   }
16025 
16026   DAG.RemoveDeadNode(Node);
16027   return nullptr;
16028 }
16029 
16030 static bool isFrameIndexOp(SDValue Op) {
16031   if (Op.getOpcode() == ISD::AssertZext)
16032     Op = Op.getOperand(0);
16033 
16034   return isa<FrameIndexSDNode>(Op);
16035 }
16036 
16037 /// Legalize target independent instructions (e.g. INSERT_SUBREG)
16038 /// with frame index operands.
16039 /// LLVM assumes that inputs to these instructions are registers.
16040 SDNode *
16041 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
16042                                                 SelectionDAG &DAG) const {
16043   if (Node->getOpcode() == ISD::CopyToReg) {
16044     RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
16045     SDValue SrcVal = Node->getOperand(2);
16046 
16047     // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
16048     // to try understanding copies to physical registers.
16049     if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
16050       SDLoc SL(Node);
16051       MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
16052       SDValue VReg = DAG.getRegister(
16053           MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
16054 
16055       SDNode *Glued = Node->getGluedNode();
16056       SDValue ToVReg = DAG.getCopyToReg(
16057           Node->getOperand(0), SL, VReg, SrcVal,
16058           SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
16059       SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
16060                                              VReg, ToVReg.getValue(1));
16061       DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
16062       DAG.RemoveDeadNode(Node);
16063       return ToResultReg.getNode();
16064     }
16065   }
16066 
16067   SmallVector<SDValue, 8> Ops;
16068   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
16069     if (!isFrameIndexOp(Node->getOperand(i))) {
16070       Ops.push_back(Node->getOperand(i));
16071       continue;
16072     }
16073 
16074     SDLoc DL(Node);
16075     Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
16076                                              Node->getOperand(i).getValueType(),
16077                                              Node->getOperand(i)),
16078                           0));
16079   }
16080 
16081   return DAG.UpdateNodeOperands(Node, Ops);
16082 }
16083 
16084 /// Fold the instructions after selecting them.
16085 /// Returns null if users were already updated.
16086 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
16087                                           SelectionDAG &DAG) const {
16088   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16089   unsigned Opcode = Node->getMachineOpcode();
16090 
16091   if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
16092       !TII->isGather4(Opcode) &&
16093       AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
16094     return adjustWritemask(Node, DAG);
16095   }
16096 
16097   if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
16098     legalizeTargetIndependentNode(Node, DAG);
16099     return Node;
16100   }
16101 
16102   switch (Opcode) {
16103   case AMDGPU::V_DIV_SCALE_F32_e64:
16104   case AMDGPU::V_DIV_SCALE_F64_e64: {
16105     // Satisfy the operand register constraint when one of the inputs is
16106     // undefined. Ordinarily each undef value will have its own implicit_def of
16107     // a vreg, so force these to use a single register.
16108     SDValue Src0 = Node->getOperand(1);
16109     SDValue Src1 = Node->getOperand(3);
16110     SDValue Src2 = Node->getOperand(5);
16111 
16112     if ((Src0.isMachineOpcode() &&
16113          Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
16114         (Src0 == Src1 || Src0 == Src2))
16115       break;
16116 
16117     MVT VT = Src0.getValueType().getSimpleVT();
16118     const TargetRegisterClass *RC =
16119         getRegClassFor(VT, Src0.getNode()->isDivergent());
16120 
16121     MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
16122     SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
16123 
16124     SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
16125                                       Src0, SDValue());
16126 
16127     // src0 must be the same register as src1 or src2, even if the value is
16128     // undefined, so make sure we don't violate this constraint.
16129     if (Src0.isMachineOpcode() &&
16130         Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
16131       if (Src1.isMachineOpcode() &&
16132           Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
16133         Src0 = Src1;
16134       else if (Src2.isMachineOpcode() &&
16135                Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
16136         Src0 = Src2;
16137       else {
16138         assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
16139         Src0 = UndefReg;
16140         Src1 = UndefReg;
16141       }
16142     } else
16143       break;
16144 
16145     SmallVector<SDValue, 9> Ops(Node->ops());
16146     Ops[1] = Src0;
16147     Ops[3] = Src1;
16148     Ops[5] = Src2;
16149     Ops.push_back(ImpDef.getValue(1));
16150     return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
16151   }
16152   default:
16153     break;
16154   }
16155 
16156   return Node;
16157 }
16158 
16159 // Any MIMG instruction that uses tfe or lwe requires an initialization of
16160 // the result register that will be written in the case of a memory access
16161 // failure. The required code is also added to tie this init code to the
16162 // result of the img instruction.
16163 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
16164   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16165   const SIRegisterInfo &TRI = TII->getRegisterInfo();
16166   MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
16167   MachineBasicBlock &MBB = *MI.getParent();
16168 
16169   int DstIdx =
16170       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
16171   unsigned InitIdx = 0;
16172 
16173   if (TII->isImage(MI)) {
16174     MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
16175     MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
16176     MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
16177 
16178     if (!TFE && !LWE) // intersect_ray
16179       return;
16180 
16181     unsigned TFEVal = TFE ? TFE->getImm() : 0;
16182     unsigned LWEVal = LWE ? LWE->getImm() : 0;
16183     unsigned D16Val = D16 ? D16->getImm() : 0;
16184 
16185     if (!TFEVal && !LWEVal)
16186       return;
16187 
16188     // At least one of TFE or LWE is non-zero.
16189     // We have to insert a suitable initialization of the result value and
16190     // tie this to the dest of the image instruction.
16191 
16192     // Calculate which dword we have to initialize to 0.
16193     MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
16194 
16195     // Check that the dmask operand was found.
16196     assert(MO_Dmask && "Expected dmask operand in instruction");
16197 
16198     unsigned dmask = MO_Dmask->getImm();
16199     // Determine the number of active lanes, taking into account the
16200     // Gather4 special case.
16201     unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
16202 
16203     bool Packed = !Subtarget->hasUnpackedD16VMem();
16204 
16205     InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
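    // E.g. (illustrative): 3 active lanes with packed D16 give
    // InitIdx = ((3 + 1) >> 1) + 1 = 3 dwords; without D16 this is
    // 3 + 1 = 4 dwords (the data plus the TFE/LWE status dword).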
16206 
16207     // Abandon the attempt if the dst size isn't large enough. This is in
16208     // fact an error, but it is picked up elsewhere and reported
16209     // correctly.
16210     uint32_t DstSize =
16211         TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
16212     if (DstSize < InitIdx)
16213       return;
16214   } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
16215     InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
16216   } else {
16217     return;
16218   }
16219 
16220   const DebugLoc &DL = MI.getDebugLoc();
16221 
16222   // Create a register for the initialization value.
16223   Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
16224   unsigned NewDst = 0; // Final initialized value will be in here
16225 
16226   // If the PRTStrictNull feature is enabled (the default), initialize all
16227   // the result registers to 0; otherwise initialize just the error
16228   // indication register (VGPRn+1).
16229   unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
16230   unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
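  // E.g. (illustrative) with InitIdx = 4: under PRTStrictNull we zero dwords
  // 0..3; otherwise we only zero dword 3, the error indication register.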
16231 
16232   BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
16233   for (; SizeLeft; SizeLeft--, CurrIdx++) {
16234     NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
16235     // Initialize dword
16236     Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
16237     // clang-format off
16238     BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
16239         .addImm(0);
16240     // clang-format on
16241     // Insert into the super-reg
16242     BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
16243         .addReg(PrevDst)
16244         .addReg(SubReg)
16245         .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
16246 
16247     PrevDst = NewDst;
16248   }
16249 
16250   // Add as an implicit operand
16251   MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
16252 
16253   // Tie the just added implicit operand to the dst
16254   MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
16255 }
16256 
16257 /// Assign the register class depending on the number of
16258 /// bits set in the writemask
16259 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
16260                                                      SDNode *Node) const {
16261   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16262 
16263   MachineFunction *MF = MI.getParent()->getParent();
16264   MachineRegisterInfo &MRI = MF->getRegInfo();
16265   SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
16266 
16267   if (TII->isVOP3(MI.getOpcode())) {
16268     // Make sure constant bus requirements are respected.
16269     TII->legalizeOperandsVOP3(MRI, MI);
16270 
16271     // Prefer VGPRs over AGPRs in mAI instructions where possible.
16272     // This saves a chain-copy of registers and better balances register
16273     // use between VGPRs and AGPRs, as AGPR tuples tend to be big.
16274     if (!MI.getDesc().operands().empty()) {
16275       unsigned Opc = MI.getOpcode();
16276       bool HasAGPRs = Info->mayNeedAGPRs();
16277       const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16278       int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
16279       for (auto I :
16280            {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
16281             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
16282         if (I == -1)
16283           break;
16284         if ((I == Src2Idx) && (HasAGPRs))
16285           break;
16286         MachineOperand &Op = MI.getOperand(I);
16287         if (!Op.isReg() || !Op.getReg().isVirtual())
16288           continue;
16289         auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
16290         if (!TRI->hasAGPRs(RC))
16291           continue;
16292         auto *Src = MRI.getUniqueVRegDef(Op.getReg());
16293         if (!Src || !Src->isCopy() ||
16294             !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
16295           continue;
16296         auto *NewRC = TRI->getEquivalentVGPRClass(RC);
16297         // All uses of agpr64 and agpr32 can also accept vgpr except for
16298         // v_accvgpr_read, but we do not produce agpr reads during selection,
16299         // so no use checks are needed.
16300         MRI.setRegClass(Op.getReg(), NewRC);
16301       }
16302 
16303       if (TII->isMAI(MI)) {
16304         // The ordinary src0, src1, src2 were legalized above.
16305         //
16306         // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
16307         // as a separate instruction.
16308         int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
16309                                                  AMDGPU::OpName::scale_src0);
16310         if (Src0Idx != -1) {
16311           int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
16312                                                    AMDGPU::OpName::scale_src1);
16313           if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
16314               TII->usesConstantBus(MRI, MI, Src1Idx))
16315             TII->legalizeOpWithMove(MI, Src1Idx);
16316         }
16317       }
16318 
16319       if (!HasAGPRs)
16320         return;
16321 
16322       // Resolve the rest of AV operands to AGPRs.
16323       if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
16324         if (Src2->isReg() && Src2->getReg().isVirtual()) {
16325           auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
16326           if (TRI->isVectorSuperClass(RC)) {
16327             auto *NewRC = TRI->getEquivalentAGPRClass(RC);
16328             MRI.setRegClass(Src2->getReg(), NewRC);
16329             if (Src2->isTied())
16330               MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
16331           }
16332         }
16333       }
16334     }
16335 
16336     return;
16337   }
16338 
16339   if (TII->isImage(MI))
16340     TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
16341 }
16342 
16343 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
16344                               uint64_t Val) {
16345   SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
16346   return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
16347 }
16348 
16349 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
16350                                                 const SDLoc &DL,
16351                                                 SDValue Ptr) const {
16352   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16353 
16354   // Build the constant half of the descriptor first, before building the
16355   // full 128-bit register. If we are building multiple resource descriptors,
16356   // this allows the 2-component register to be CSE'd.
16357   const SDValue Ops0[] = {
16358       DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
16359       buildSMovImm32(DAG, DL, 0),
16360       DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
16361       buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
16362       DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
16363 
16364   SDValue SubRegHi = SDValue(
16365       DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
16366 
16367   // Combine the constants and the pointer.
16368   const SDValue Ops1[] = {
16369       DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
16370       DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
16371       DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
16372 
16373   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
16374 }
16375 
16376 /// Return a resource descriptor with the 'Add TID' bit enabled
16377 ///        The TID (Thread ID) is multiplied by the stride value (bits [61:48]
16378 ///        of the resource descriptor) to create an offset, which is added to
16379 ///        the resource pointer.
16380 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
16381                                            SDValue Ptr, uint32_t RsrcDword1,
16382                                            uint64_t RsrcDword2And3) const {
16383   SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
16384   SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
16385   if (RsrcDword1) {
16386     PtrHi =
16387         SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
16388                                    DAG.getConstant(RsrcDword1, DL, MVT::i32)),
16389                 0);
16390   }
16391 
16392   SDValue DataLo =
16393       buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
16394   SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
16395 
16396   const SDValue Ops[] = {
16397       DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
16398       PtrLo,
16399       DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
16400       PtrHi,
16401       DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
16402       DataLo,
16403       DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
16404       DataHi,
16405       DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
16406 
16407   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
16408 }
16409 
16410 //===----------------------------------------------------------------------===//
16411 //                         SI Inline Assembly Support
16412 //===----------------------------------------------------------------------===//
16413 
16414 std::pair<unsigned, const TargetRegisterClass *>
16415 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
16416                                                StringRef Constraint,
16417                                                MVT VT) const {
16418   const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
16419 
16420   const TargetRegisterClass *RC = nullptr;
16421   if (Constraint.size() == 1) {
16422     const unsigned BitWidth = VT.getSizeInBits();
16423     switch (Constraint[0]) {
16424     default:
16425       return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
16426     case 's':
16427     case 'r':
16428       switch (BitWidth) {
16429       case 16:
16430         RC = &AMDGPU::SReg_32RegClass;
16431         break;
16432       case 64:
16433         RC = &AMDGPU::SGPR_64RegClass;
16434         break;
16435       default:
16436         RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
16437         if (!RC)
16438           return std::pair(0U, nullptr);
16439         break;
16440       }
16441       break;
16442     case 'v':
16443       switch (BitWidth) {
16444       case 16:
16445         RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
16446                                              : &AMDGPU::VGPR_32RegClass;
16447         break;
16448       default:
16449         RC = TRI->getVGPRClassForBitWidth(BitWidth);
16450         if (!RC)
16451           return std::pair(0U, nullptr);
16452         break;
16453       }
16454       break;
16455     case 'a':
16456       if (!Subtarget->hasMAIInsts())
16457         break;
16458       switch (BitWidth) {
16459       case 16:
16460         RC = &AMDGPU::AGPR_32RegClass;
16461         break;
16462       default:
16463         RC = TRI->getAGPRClassForBitWidth(BitWidth);
16464         if (!RC)
16465           return std::pair(0U, nullptr);
16466         break;
16467       }
16468       break;
16469     }
16470     // We actually support i128, i16 and f16 as inline parameters
16471     // even if they are not reported as legal
16472     if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
16473                VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
16474       return std::pair(0U, RC);
16475   }
16476 
16477   if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
16478     StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
16479     if (RegName.consume_front("v")) {
16480       RC = &AMDGPU::VGPR_32RegClass;
16481     } else if (RegName.consume_front("s")) {
16482       RC = &AMDGPU::SGPR_32RegClass;
16483     } else if (RegName.consume_front("a")) {
16484       RC = &AMDGPU::AGPR_32RegClass;
16485     }
16486 
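    // Illustrative examples (hypothetical constraints): "{v5}" yields v5 in
    // VGPR_32, while "{s[4:5]}" parses Idx = 4 and End = 5, giving Width = 64
    // and the s[4:5] 64-bit super-register, provided the width matches VT.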
16487     if (RC) {
16488       uint32_t Idx;
16489       if (RegName.consume_front("[")) {
16490         uint32_t End;
16491         bool Failed = RegName.consumeInteger(10, Idx);
16492         Failed |= !RegName.consume_front(":");
16493         Failed |= RegName.consumeInteger(10, End);
16494         Failed |= !RegName.consume_back("]");
16495         if (!Failed) {
16496           uint32_t Width = (End - Idx + 1) * 32;
16497           // Prohibit constraints for register ranges with a width that does not
16498           // match the required type.
16499           if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
16500             return std::pair(0U, nullptr);
16501           MCRegister Reg = RC->getRegister(Idx);
16502           if (SIRegisterInfo::isVGPRClass(RC))
16503             RC = TRI->getVGPRClassForBitWidth(Width);
16504           else if (SIRegisterInfo::isSGPRClass(RC))
16505             RC = TRI->getSGPRClassForBitWidth(Width);
16506           else if (SIRegisterInfo::isAGPRClass(RC))
16507             RC = TRI->getAGPRClassForBitWidth(Width);
16508           if (RC) {
16509             Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
16510             if (!Reg) {
16511               // The register class does not contain the requested register,
16512               // e.g., because it is an SGPR pair that would violate alignment
16513               // requirements.
16514               return std::pair(0U, nullptr);
16515             }
16516             return std::pair(Reg, RC);
16517           }
16518         }
16519       } else {
16520         // Check for lossy scalar/vector conversions.
16521         if (VT.isVector() && VT.getSizeInBits() != 32)
16522           return std::pair(0U, nullptr);
16523         bool Failed = RegName.getAsInteger(10, Idx);
16524         if (!Failed && Idx < RC->getNumRegs())
16525           return std::pair(RC->getRegister(Idx), RC);
16526       }
16527     }
16528   }
16529 
16530   auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
16531   if (Ret.first)
16532     Ret.second = TRI->getPhysRegBaseClass(Ret.first);
16533 
16534   return Ret;
16535 }
16536 
16537 static bool isImmConstraint(StringRef Constraint) {
16538   if (Constraint.size() == 1) {
16539     switch (Constraint[0]) {
16540     default:
16541       break;
16542     case 'I':
16543     case 'J':
16544     case 'A':
16545     case 'B':
16546     case 'C':
16547       return true;
16548     }
16549   } else if (Constraint == "DA" || Constraint == "DB") {
16550     return true;
16551   }
16552   return false;
16553 }
16554 
16555 SITargetLowering::ConstraintType
16556 SITargetLowering::getConstraintType(StringRef Constraint) const {
16557   if (Constraint.size() == 1) {
16558     switch (Constraint[0]) {
16559     default:
16560       break;
16561     case 's':
16562     case 'v':
16563     case 'a':
16564       return C_RegisterClass;
16565     }
16566   }
16567   if (isImmConstraint(Constraint)) {
16568     return C_Other;
16569   }
16570   return TargetLowering::getConstraintType(Constraint);
16571 }
16572 
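// Mask a constant down to the operand size unless it is already an inline
// immediate. Illustrative: clearUnusedBits(0xFFFF4400, 16) yields 0x4400,
// while clearUnusedBits(-16, 16) is unchanged since -16 is inlinable.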
16573 static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
16574   if (!AMDGPU::isInlinableIntLiteral(Val)) {
16575     Val = Val & maskTrailingOnes<uint64_t>(Size);
16576   }
16577   return Val;
16578 }
16579 
16580 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
16581                                                     StringRef Constraint,
16582                                                     std::vector<SDValue> &Ops,
16583                                                     SelectionDAG &DAG) const {
16584   if (isImmConstraint(Constraint)) {
16585     uint64_t Val;
16586     if (getAsmOperandConstVal(Op, Val) &&
16587         checkAsmConstraintVal(Op, Constraint, Val)) {
16588       Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
16589       Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
16590     }
16591   } else {
16592     TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
16593   }
16594 }
16595 
16596 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
16597   unsigned Size = Op.getScalarValueSizeInBits();
16598   if (Size > 64)
16599     return false;
16600 
16601   if (Size == 16 && !Subtarget->has16BitInsts())
16602     return false;
16603 
16604   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
16605     Val = C->getSExtValue();
16606     return true;
16607   }
16608   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
16609     Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
16610     return true;
16611   }
16612   if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
16613     if (Size != 16 || Op.getNumOperands() != 2)
16614       return false;
16615     if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
16616       return false;
16617     if (ConstantSDNode *C = V->getConstantSplatNode()) {
16618       Val = C->getSExtValue();
16619       return true;
16620     }
16621     if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
16622       Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
16623       return true;
16624     }
16625   }
16626 
16627   return false;
16628 }
16629 
16630 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
16631                                              uint64_t Val) const {
16632   if (Constraint.size() == 1) {
16633     switch (Constraint[0]) {
16634     case 'I':
16635       return AMDGPU::isInlinableIntLiteral(Val);
16636     case 'J':
16637       return isInt<16>(Val);
16638     case 'A':
16639       return checkAsmConstraintValA(Op, Val);
16640     case 'B':
16641       return isInt<32>(Val);
16642     case 'C':
16643       return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
16644              AMDGPU::isInlinableIntLiteral(Val);
16645     default:
16646       break;
16647     }
16648   } else if (Constraint.size() == 2) {
16649     if (Constraint == "DA") {
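      // Illustrative: Val = 0xFFFFFFF000000040 splits into HiBits = -16 and
      // LoBits = 64; both halves are inline immediates, so "DA" is satisfied.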
16650       int64_t HiBits = static_cast<int32_t>(Val >> 32);
16651       int64_t LoBits = static_cast<int32_t>(Val);
16652       return checkAsmConstraintValA(Op, HiBits, 32) &&
16653              checkAsmConstraintValA(Op, LoBits, 32);
16654     }
16655     if (Constraint == "DB") {
16656       return true;
16657     }
16658   }
16659   llvm_unreachable("Invalid asm constraint");
16660 }
16661 
16662 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
16663                                               unsigned MaxSize) const {
16664   unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
16665   bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
16666   if (Size == 16) {
16667     MVT VT = Op.getSimpleValueType();
16668     switch (VT.SimpleTy) {
16669     default:
16670       return false;
16671     case MVT::i16:
16672       return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
16673     case MVT::f16:
16674       return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
16675     case MVT::bf16:
16676       return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
16677     case MVT::v2i16:
16678       return AMDGPU::getInlineEncodingV2I16(Val).has_value();
16679     case MVT::v2f16:
16680       return AMDGPU::getInlineEncodingV2F16(Val).has_value();
16681     case MVT::v2bf16:
16682       return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
16683     }
16684   }
16685   if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
16686       (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
16687     return true;
16688   return false;
16689 }
16690 
16691 static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
16692   switch (UnalignedClassID) {
16693   case AMDGPU::VReg_64RegClassID:
16694     return AMDGPU::VReg_64_Align2RegClassID;
16695   case AMDGPU::VReg_96RegClassID:
16696     return AMDGPU::VReg_96_Align2RegClassID;
16697   case AMDGPU::VReg_128RegClassID:
16698     return AMDGPU::VReg_128_Align2RegClassID;
16699   case AMDGPU::VReg_160RegClassID:
16700     return AMDGPU::VReg_160_Align2RegClassID;
16701   case AMDGPU::VReg_192RegClassID:
16702     return AMDGPU::VReg_192_Align2RegClassID;
16703   case AMDGPU::VReg_224RegClassID:
16704     return AMDGPU::VReg_224_Align2RegClassID;
16705   case AMDGPU::VReg_256RegClassID:
16706     return AMDGPU::VReg_256_Align2RegClassID;
16707   case AMDGPU::VReg_288RegClassID:
16708     return AMDGPU::VReg_288_Align2RegClassID;
16709   case AMDGPU::VReg_320RegClassID:
16710     return AMDGPU::VReg_320_Align2RegClassID;
16711   case AMDGPU::VReg_352RegClassID:
16712     return AMDGPU::VReg_352_Align2RegClassID;
16713   case AMDGPU::VReg_384RegClassID:
16714     return AMDGPU::VReg_384_Align2RegClassID;
16715   case AMDGPU::VReg_512RegClassID:
16716     return AMDGPU::VReg_512_Align2RegClassID;
16717   case AMDGPU::VReg_1024RegClassID:
16718     return AMDGPU::VReg_1024_Align2RegClassID;
16719   case AMDGPU::AReg_64RegClassID:
16720     return AMDGPU::AReg_64_Align2RegClassID;
16721   case AMDGPU::AReg_96RegClassID:
16722     return AMDGPU::AReg_96_Align2RegClassID;
16723   case AMDGPU::AReg_128RegClassID:
16724     return AMDGPU::AReg_128_Align2RegClassID;
16725   case AMDGPU::AReg_160RegClassID:
16726     return AMDGPU::AReg_160_Align2RegClassID;
16727   case AMDGPU::AReg_192RegClassID:
16728     return AMDGPU::AReg_192_Align2RegClassID;
16729   case AMDGPU::AReg_256RegClassID:
16730     return AMDGPU::AReg_256_Align2RegClassID;
16731   case AMDGPU::AReg_512RegClassID:
16732     return AMDGPU::AReg_512_Align2RegClassID;
16733   case AMDGPU::AReg_1024RegClassID:
16734     return AMDGPU::AReg_1024_Align2RegClassID;
16735   default:
16736     return -1;
16737   }
16738 }
16739 
16740 // Figure out which registers should be reserved for stack access. Only after
16741 // the function is legalized do we know all of the non-spill stack objects or if
16742 // calls are present.
16743 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
16744   MachineRegisterInfo &MRI = MF.getRegInfo();
16745   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16746   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
16747   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16748   const SIInstrInfo *TII = ST.getInstrInfo();
16749 
16750   if (Info->isEntryFunction()) {
16751     // Callable functions have fixed registers used for stack access.
16752     reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
16753   }
16754 
16755   // TODO: Move this logic to getReservedRegs()
16756   // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
16757   unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16758   Register SReg = ST.isWave32()
16759                       ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
16760                       : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
16761                                                      &AMDGPU::SGPR_64RegClass);
16762   Info->setSGPRForEXECCopy(SReg);
16763 
16764   assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
16765                              Info->getStackPtrOffsetReg()));
16766   if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16767     MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
16768 
16769   // We need to worry about replacing the default register with itself in case
16770   // of MIR testcases missing the MFI.
16771   if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16772     MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
16773 
16774   if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16775     MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
16776 
16777   Info->limitOccupancy(MF);
16778 
16779   if (ST.isWave32() && !MF.empty()) {
16780     for (auto &MBB : MF) {
16781       for (auto &MI : MBB) {
16782         TII->fixImplicitOperands(MI);
16783       }
16784     }
16785   }
16786 
16787   // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
16788   // classes if required. Ideally the register class constraints would differ
16789   // per-subtarget, but there's no easy way to achieve that right now. This is
16790   // not a problem for VGPRs because the correctly aligned VGPR class is implied
16791   // from using them as the register class for legal types.
16792   if (ST.needsAlignedVGPRs()) {
16793     for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
16794       const Register Reg = Register::index2VirtReg(I);
16795       const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
16796       if (!RC)
16797         continue;
16798       int NewClassID = getAlignedAGPRClassID(RC->getID());
16799       if (NewClassID != -1)
16800         MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
16801     }
16802   }
16803 
16804   TargetLoweringBase::finalizeLowering(MF);
16805 }
16806 
16807 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
16808                                                      KnownBits &Known,
16809                                                      const APInt &DemandedElts,
16810                                                      const SelectionDAG &DAG,
16811                                                      unsigned Depth) const {
16812   Known.resetAll();
16813   unsigned Opc = Op.getOpcode();
16814   switch (Opc) {
16815   case ISD::INTRINSIC_WO_CHAIN: {
16816     unsigned IID = Op.getConstantOperandVal(0);
16817     switch (IID) {
16818     case Intrinsic::amdgcn_mbcnt_lo:
16819     case Intrinsic::amdgcn_mbcnt_hi: {
16820       const GCNSubtarget &ST =
16821           DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
16822       // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16823       // most 31 + src1.
16824       Known.Zero.setBitsFrom(
16825           IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
16826       KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
16827       Known = KnownBits::add(Known, Known2);
16828       return;
16829     }
16830     }
16831     break;
16832   }
16833   }
16834   return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
16835       Op, Known, DemandedElts, DAG, Depth);
16836 }
16837 
16838 void SITargetLowering::computeKnownBitsForFrameIndex(
16839     const int FI, KnownBits &Known, const MachineFunction &MF) const {
16840   TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
16841 
16842   // Set the high bits to zero based on the maximum allowed scratch size per
16843   // wave. We can't use vaddr in MUBUF instructions if we don't know the address
16844   // calculation won't overflow, so assume the sign bit is never set.
16845   Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
16846 }
16847 
16848 static void knownBitsForWorkitemID(const GCNSubtarget &ST,
16849                                    GISelValueTracking &VT, KnownBits &Known,
16850                                    unsigned Dim) {
16851   unsigned MaxValue =
16852       ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
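  // Illustrative: a maximum workitem id of 1023 (0x3FF) has 22 leading zeros
  // as a 32-bit value, so the top 22 bits of the result are known zero.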
16853   Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
16854 }
16855 
16856 void SITargetLowering::computeKnownBitsForTargetInstr(
16857     GISelValueTracking &VT, Register R, KnownBits &Known,
16858     const APInt &DemandedElts, const MachineRegisterInfo &MRI,
16859     unsigned Depth) const {
16860   const MachineInstr *MI = MRI.getVRegDef(R);
16861   switch (MI->getOpcode()) {
16862   case AMDGPU::G_INTRINSIC:
16863   case AMDGPU::G_INTRINSIC_CONVERGENT: {
16864     Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
16865     switch (IID) {
16866     case Intrinsic::amdgcn_workitem_id_x:
16867       knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
16868       break;
16869     case Intrinsic::amdgcn_workitem_id_y:
16870       knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
16871       break;
16872     case Intrinsic::amdgcn_workitem_id_z:
16873       knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
16874       break;
16875     case Intrinsic::amdgcn_mbcnt_lo:
16876     case Intrinsic::amdgcn_mbcnt_hi: {
16877       // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16878       // most 31 + src1.
16879       Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
16880                                  ? getSubtarget()->getWavefrontSizeLog2()
16881                                  : 5);
16882       KnownBits Known2;
16883       VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
16884                               Depth + 1);
16885       Known = KnownBits::add(Known, Known2);
16886       break;
16887     }
16888     case Intrinsic::amdgcn_groupstaticsize: {
16889       // We can report everything over the maximum size as 0. We can't report
16890       // based on the actual size because we don't know if it's accurate or not
16891       // at any given point.
16892       Known.Zero.setHighBits(
16893           llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
16894       break;
16895     }
16896     }
16897     break;
16898   }
16899   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16900     Known.Zero.setHighBits(24);
16901     break;
16902   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16903     Known.Zero.setHighBits(16);
16904     break;
16905   case AMDGPU::G_AMDGPU_SMED3:
16906   case AMDGPU::G_AMDGPU_UMED3: {
16907     auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
16908 
16909     KnownBits Known2;
16910     VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
16911     if (Known2.isUnknown())
16912       break;
16913 
16914     KnownBits Known1;
16915     VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
16916     if (Known1.isUnknown())
16917       break;
16918 
16919     KnownBits Known0;
16920     VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
16921     if (Known0.isUnknown())
16922       break;
16923 
16924     // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
16925     Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
16926     Known.One = Known0.One & Known1.One & Known2.One;
16927     break;
16928   }
16929   }
16930 }
16931 
16932 Align SITargetLowering::computeKnownAlignForTargetInstr(
16933     GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
16934     unsigned Depth) const {
16935   const MachineInstr *MI = MRI.getVRegDef(R);
16936   if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
16937     // FIXME: Can this move to generic code? What about the case where the call
16938     // site specifies a lower alignment?
16939     Intrinsic::ID IID = GI->getIntrinsicID();
16940     LLVMContext &Ctx = VT.getMachineFunction().getFunction().getContext();
16941     AttributeList Attrs =
16942         Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
16943     if (MaybeAlign RetAlign = Attrs.getRetAlignment())
16944       return *RetAlign;
16945   }
16946   return Align(1);
16947 }
16948 
16949 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
16950   const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
16951   const Align CacheLineAlign = Align(64);
16952 
16953   // Pre-GFX10 targets did not benefit from loop alignment.
16954   if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
16955       getSubtarget()->hasInstFwdPrefetchBug())
16956     return PrefAlign;
16957 
16958   // On GFX10 the I$ consists of 4 x 64-byte cache lines.
16959   // By default the prefetcher keeps one cache line behind and reads two
16960   // ahead. We can modify this with S_INST_PREFETCH so that larger loops
16961   // keep two lines behind and one ahead.
16962   // Therefore aligning loop headers pays off if the loop fits in 192 bytes:
16963   // - If the loop fits in 64 bytes it always spans no more than two cache
16964   //   lines and does not need alignment.
16965   // - Else, if the loop is at most 128 bytes, the default prefetch is fine.
16966   // - Else, if the loop is at most 192 bytes, we need two lines behind.
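  // Illustrative outcomes of the checks below: a 100-byte loop returns the
  // 64-byte cache-line alignment with no prefetch change, while a 160-byte
  // loop additionally gets S_INST_PREFETCH instructions inserted (assuming a
  // preheader and an exit block exist).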
16967 
16968   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16969   const MachineBasicBlock *Header = ML->getHeader();
16970   if (Header->getAlignment() != PrefAlign)
16971     return Header->getAlignment(); // Already processed.
16972 
16973   unsigned LoopSize = 0;
16974   for (const MachineBasicBlock *MBB : ML->blocks()) {
16975     // If an inner loop block is aligned, assume on average half of the
16976     // alignment size is added as nops.
16977     if (MBB != Header)
16978       LoopSize += MBB->getAlignment().value() / 2;
16979 
16980     for (const MachineInstr &MI : *MBB) {
16981       LoopSize += TII->getInstSizeInBytes(MI);
16982       if (LoopSize > 192)
16983         return PrefAlign;
16984     }
16985   }
16986 
16987   if (LoopSize <= 64)
16988     return PrefAlign;
16989 
16990   if (LoopSize <= 128)
16991     return CacheLineAlign;
16992 
16993   // If any of the parent loops is surrounded by prefetch instructions, do
16994   // not insert new ones for the inner loop; that would reset the parent's settings.
16995   for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
16996     if (MachineBasicBlock *Exit = P->getExitBlock()) {
16997       auto I = Exit->getFirstNonDebugInstr();
16998       if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
16999         return CacheLineAlign;
17000     }
17001   }
17002 
17003   MachineBasicBlock *Pre = ML->getLoopPreheader();
17004   MachineBasicBlock *Exit = ML->getExitBlock();
17005 
17006   if (Pre && Exit) {
17007     auto PreTerm = Pre->getFirstTerminator();
17008     if (PreTerm == Pre->begin() ||
17009         std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
17010       BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
17011           .addImm(1); // prefetch 2 lines behind PC
17012 
17013     auto ExitHead = Exit->getFirstNonDebugInstr();
17014     if (ExitHead == Exit->end() ||
17015         ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
17016       BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
17017           .addImm(2); // prefetch 1 line behind PC
17018   }
17019 
17020   return CacheLineAlign;
17021 }
17022 
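// Assertion helper: returns true if this CopyFromReg is fed, through a chain
// of CopyFromReg nodes, by an INLINEASM or INLINEASM_BR node.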
17023 LLVM_ATTRIBUTE_UNUSED
17024 static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
17025   assert(N->getOpcode() == ISD::CopyFromReg);
17026   do {
17027     // Follow the chain until we find an INLINEASM node.
17028     N = N->getOperand(0).getNode();
17029     if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
17030       return true;
17031   } while (N->getOpcode() == ISD::CopyFromReg);
17032   return false;
17033 }
17034 
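// Decide whether the value defined by \p N must be treated as divergent,
// e.g. copies from VGPRs, loads that may access private memory,
// read-modify-write atomics, and intrinsic sources of divergence.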
17035 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
17036                                                   FunctionLoweringInfo *FLI,
17037                                                   UniformityInfo *UA) const {
17038   switch (N->getOpcode()) {
17039   case ISD::CopyFromReg: {
17040     const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
17041     const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
17042     const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17043     Register Reg = R->getReg();
17044 
17045     // FIXME: Why does this need to consider isLiveIn?
17046     if (Reg.isPhysical() || MRI.isLiveIn(Reg))
17047       return !TRI->isSGPRReg(MRI, Reg);
17048 
17049     if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
17050       return UA->isDivergent(V);
17051 
17052     assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
17053     return !TRI->isSGPRReg(MRI, Reg);
17054   }
17055   case ISD::LOAD: {
17056     const LoadSDNode *L = cast<LoadSDNode>(N);
17057     unsigned AS = L->getAddressSpace();
17058     // A flat load may access private memory.
17059     return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
17060   }
17061   case ISD::CALLSEQ_END:
17062     return true;
17063   case ISD::INTRINSIC_WO_CHAIN:
17064     return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
17065   case ISD::INTRINSIC_W_CHAIN:
17066     return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
17067   case AMDGPUISD::ATOMIC_CMP_SWAP:
17068   case AMDGPUISD::BUFFER_ATOMIC_SWAP:
17069   case AMDGPUISD::BUFFER_ATOMIC_ADD:
17070   case AMDGPUISD::BUFFER_ATOMIC_SUB:
17071   case AMDGPUISD::BUFFER_ATOMIC_SMIN:
17072   case AMDGPUISD::BUFFER_ATOMIC_UMIN:
17073   case AMDGPUISD::BUFFER_ATOMIC_SMAX:
17074   case AMDGPUISD::BUFFER_ATOMIC_UMAX:
17075   case AMDGPUISD::BUFFER_ATOMIC_AND:
17076   case AMDGPUISD::BUFFER_ATOMIC_OR:
17077   case AMDGPUISD::BUFFER_ATOMIC_XOR:
17078   case AMDGPUISD::BUFFER_ATOMIC_INC:
17079   case AMDGPUISD::BUFFER_ATOMIC_DEC:
17080   case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
17081   case AMDGPUISD::BUFFER_ATOMIC_CSUB:
17082   case AMDGPUISD::BUFFER_ATOMIC_FADD:
17083   case AMDGPUISD::BUFFER_ATOMIC_FMIN:
17084   case AMDGPUISD::BUFFER_ATOMIC_FMAX:
17085     // Target-specific read-modify-write atomics are sources of divergence.
17086     return true;
17087   default:
17088     if (auto *A = dyn_cast<AtomicSDNode>(N)) {
17089       // Generic read-modify-write atomics are sources of divergence.
17090       return A->readMem() && A->writeMem();
17091     }
17092     return false;
17093   }
17094 }
17095 
17096 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
17097                                                EVT VT) const {
17098   switch (VT.getScalarType().getSimpleVT().SimpleTy) {
17099   case MVT::f32:
17100     return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
17101   case MVT::f64:
17102   case MVT::f16:
17103     return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
17104   default:
17105     return false;
17106   }
17107 }
17108 
17109 bool SITargetLowering::denormalsEnabledForType(
17110     LLT Ty, const MachineFunction &MF) const {
17111   switch (Ty.getScalarSizeInBits()) {
17112   case 32:
17113     return !denormalModeIsFlushAllF32(MF);
17114   case 64:
17115   case 16:
17116     return !denormalModeIsFlushAllF64F16(MF);
17117   default:
17118     return false;
17119   }
17120 }
17121 
17122 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
17123                                                     const APInt &DemandedElts,
17124                                                     const SelectionDAG &DAG,
17125                                                     bool SNaN,
17126                                                     unsigned Depth) const {
17127   if (Op.getOpcode() == AMDGPUISD::CLAMP) {
17128     const MachineFunction &MF = DAG.getMachineFunction();
17129     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
17130 
17131     if (Info->getMode().DX10Clamp)
17132       return true; // Clamped to 0.
17133     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
17134   }
17135 
17136   return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
17137                                                             DAG, SNaN, Depth);
17138 }
17139 
17140 // On older subtargets, global FP atomic instructions have a hardcoded FP mode:
17141 // they do not support FP32 denormals and only support v2f16/f64 denormals.
17142 static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
17143   if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
17144     return true;
17145 
17146   const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
17147   auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
17148   if (DenormMode == DenormalMode::getPreserveSign())
17149     return true;
17150 
17151   // TODO: Remove this.
17152   return RMW->getFunction()
17153       ->getFnAttribute("amdgpu-unsafe-fp-atomics")
17154       .getValueAsBool();
17155 }
17156 
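// Build the remark reporting that a hardware instruction was generated for an
// atomic operation; callers append why emitting it was considered acceptable.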
17157 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
17158   LLVMContext &Ctx = RMW->getContext();
17159   StringRef MemScope =
17160       Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
17161 
17162   return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
17163          << "Hardware instruction generated for atomic "
17164          << RMW->getOperationName(RMW->getOperation())
17165          << " operation at memory scope " << MemScope;
17166 }
17167 
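/// \return true if \p Ty is <2 x half> or <2 x bfloat>.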
17168 static bool isV2F16OrV2BF16(Type *Ty) {
17169   if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
17170     Type *EltTy = VT->getElementType();
17171     return VT->getNumElements() == 2 &&
17172            (EltTy->isHalfTy() || EltTy->isBFloatTy());
17173   }
17174 
17175   return false;
17176 }
17177 
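/// \return true if \p Ty is <2 x half>.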
17178 static bool isV2F16(Type *Ty) {
17179   FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
17180   return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
17181 }
17182 
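/// \return true if \p Ty is <2 x bfloat>.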
17183 static bool isV2BF16(Type *Ty) {
17184   FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
17185   return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
17186 }
17187 
17188 /// \return true if atomicrmw integer ops work for the type.
17189 static bool isAtomicRMWLegalIntTy(Type *Ty) {
17190   if (auto *IT = dyn_cast<IntegerType>(Ty)) {
17191     unsigned BW = IT->getBitWidth();
17192     return BW == 32 || BW == 64;
17193   }
17194 
17195   return false;
17196 }
17197 
17198 /// \return true if this atomicrmw xchg type can be selected.
17199 static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
17200   Type *Ty = RMW->getType();
17201   if (isAtomicRMWLegalIntTy(Ty))
17202     return true;
17203 
17204   if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
17205     const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
17206     unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
17207     return BW == 32 || BW == 64;
17208   }
17209 
17210   if (Ty->isFloatTy() || Ty->isDoubleTy())
17211     return true;
17212 
17213   if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) {
17214     return VT->getNumElements() == 2 &&
17215            VT->getElementType()->getPrimitiveSizeInBits() == 16;
17216   }
17217 
17218   return false;
17219 }
17220 
17221 /// \returns true if it's valid to emit a native instruction for \p RMW, based
17222 /// on the properties of the target memory.
17223 static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
17224                                         const AtomicRMWInst *RMW,
17225                                         bool HasSystemScope) {
17226   // The remote/fine-grained access logic is different from the integer
17227   // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
17228   // fine-grained access does not work, even for a device local allocation.
17229   //
17230   // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
17231   // allocations work.
17232   if (HasSystemScope) {
17233     if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
17234         RMW->hasMetadata("amdgpu.no.remote.memory"))
17235       return true;
17236   } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
17237     return true;
17238 
17239   return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
17240 }
17241 
17242 /// \return Action to perform on AtomicRMWInsts for integer operations.
17243 static TargetLowering::AtomicExpansionKind
17244 atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
17245   return isAtomicRMWLegalIntTy(RMW->getType())
17246              ? TargetLowering::AtomicExpansionKind::None
17247              : TargetLowering::AtomicExpansionKind::CmpXChg;
17248 }
17249 
17250 /// \return true if a flat address space atomicrmw can access private memory.
17251 static bool flatInstrMayAccessPrivate(const Instruction *I) {
17252   const MDNode *NoaliasAddrSpaceMD =
17253       I->getMetadata(LLVMContext::MD_noalias_addrspace);
17254   if (!NoaliasAddrSpaceMD)
17255     return true;
17256 
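  // The metadata encodes a sorted list of [low, high) address-space ranges
  // that this instruction is known not to access; for example
  // !noalias.addrspace !{i32 5, i32 6} excludes the private address space (5).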
17257   for (unsigned Idx = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2;
17258        Idx != E; ++Idx) {
17259     auto *Low = mdconst::extract<ConstantInt>(
17260         NoaliasAddrSpaceMD->getOperand(2 * Idx + 0));
17261     if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) {
17262       auto *High = mdconst::extract<ConstantInt>(
17263           NoaliasAddrSpaceMD->getOperand(2 * Idx + 1));
17264       return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS);
17265     }
17266   }
17267 
17268   return true;
17269 }
17270 
17271 TargetLowering::AtomicExpansionKind
17272 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
17273   unsigned AS = RMW->getPointerAddressSpace();
17274   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
17275     return AtomicExpansionKind::NotAtomic;
17276 
17277   // 64-bit flat atomics whose address dynamically resides in private memory
17278   // are silently dropped, so expand them with a runtime address-space check.
17279   //
17280   // Note that we will emit a new copy of the original atomic in the expansion,
17281   // which will be incrementally relegalized.
17282   const DataLayout &DL = RMW->getFunction()->getDataLayout();
17283   if (AS == AMDGPUAS::FLAT_ADDRESS &&
17284       DL.getTypeSizeInBits(RMW->getType()) == 64 &&
17285       flatInstrMayAccessPrivate(RMW))
17286     return AtomicExpansionKind::Expand;
17287 
17288   auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
17289     OptimizationRemarkEmitter ORE(RMW->getFunction());
17290     ORE.emit([=]() {
17291       return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
17292     });
17293     return Kind;
17294   };
17295 
17296   auto SSID = RMW->getSyncScopeID();
17297   bool HasSystemScope =
17298       SSID == SyncScope::System ||
17299       SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
17300 
17301   auto Op = RMW->getOperation();
17302   switch (Op) {
17303   case AtomicRMWInst::Xchg: {
17304     // PCIe supports add and xchg for system atomics.
17305     return isAtomicRMWLegalXChgTy(RMW)
17306                ? TargetLowering::AtomicExpansionKind::None
17307                : TargetLowering::AtomicExpansionKind::CmpXChg;
17308   }
17309   case AtomicRMWInst::Add:
17310   case AtomicRMWInst::And:
17311   case AtomicRMWInst::UIncWrap:
17312   case AtomicRMWInst::UDecWrap:
17313     return atomicSupportedIfLegalIntType(RMW);
17314   case AtomicRMWInst::Sub:
17315   case AtomicRMWInst::Or:
17316   case AtomicRMWInst::Xor: {
17317     // Atomic sub/or/xor do not work over PCI express, but atomic add does.
17318     // InstCombine canonicalizes these with a zero operand to or, so undo that.
17319     if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
17320       if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
17321           ConstVal && ConstVal->isNullValue())
17322         return AtomicExpansionKind::Expand;
17323     }
17324 
17325     return atomicSupportedIfLegalIntType(RMW);
17326   }
17327   case AtomicRMWInst::FAdd: {
17328     Type *Ty = RMW->getType();
17329 
17330     // TODO: Handle REGION_ADDRESS
17331     if (AS == AMDGPUAS::LOCAL_ADDRESS) {
17332       // DS F32 FP atomics do respect the denormal mode, but the rounding mode
17333       // is fixed to round-to-nearest-even.
17334       //
17335       // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
17336       // round-to-nearest-even.
17337       //
17338       // We ignore the rounding mode problem, even in strictfp. The C++ standard
17339       // suggests it is OK if the floating-point mode may not match the calling
17340       // thread.
17341       if (Ty->isFloatTy()) {
17342         return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
17343                                                  : AtomicExpansionKind::CmpXChg;
17344       }
17345 
17346       if (Ty->isDoubleTy()) {
17347         // Ignores denormal mode, but we don't consider flushing mandatory.
17348         return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
17349                                                  : AtomicExpansionKind::CmpXChg;
17350       }
17351 
17352       if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
17353         return AtomicExpansionKind::None;
17354 
17355       return AtomicExpansionKind::CmpXChg;
17356     }
17357 
17358     // LDS atomics respect the denormal mode from the mode register.
17359     //
17360     // Traditionally f32 global/buffer memory atomics would unconditionally
17361     // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
17362     // flush.
17363     //
17364     // On targets with flat atomic fadd, denormals would flush depending on
17365     // whether the target address resides in LDS or global memory. We consider
17366     // this flat-maybe-flush as will-flush.
17367     if (Ty->isFloatTy() &&
17368         !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
17369         !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
17370       return AtomicExpansionKind::CmpXChg;
17371 
17372     // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
17373     // safe. The message phrasing also should be better.
17374     if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
17375       if (AS == AMDGPUAS::FLAT_ADDRESS) {
17376         // gfx942, gfx12
17377         if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
17378           return ReportUnsafeHWInst(AtomicExpansionKind::None);
17379       } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
17380         // gfx90a, gfx942, gfx12
17381         if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
17382           return ReportUnsafeHWInst(AtomicExpansionKind::None);
17383 
17384         // gfx942, gfx12
17385         if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
17386           return ReportUnsafeHWInst(AtomicExpansionKind::None);
17387       } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
17388         // gfx90a, gfx942, gfx12
17389         if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
17390           return ReportUnsafeHWInst(AtomicExpansionKind::None);
17391 
17392         // While gfx90a/gfx942 support v2bf16 for global/flat, they do not for
17393         // buffer. gfx12 does have the buffer version.
17394         if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
17395           return ReportUnsafeHWInst(AtomicExpansionKind::None);
17396       }
17397 
17398       // global and flat atomic fadd f64: gfx90a, gfx942.
17399       if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
17400         return ReportUnsafeHWInst(AtomicExpansionKind::None);
17401 
17402       if (AS != AMDGPUAS::FLAT_ADDRESS) {
17403         if (Ty->isFloatTy()) {
17404           // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
17405           // gfx11+.
17406           if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
17407             return ReportUnsafeHWInst(AtomicExpansionKind::None);
17408           // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
17409           if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
17410             return ReportUnsafeHWInst(AtomicExpansionKind::None);
17411         } else {
17412           // gfx908
17413           if (RMW->use_empty() &&
17414               Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
17415               isV2F16(Ty))
17416             return ReportUnsafeHWInst(AtomicExpansionKind::None);
17417         }
17418       }
17419 
17420       // flat atomic fadd f32: gfx942, gfx11+.
17421       if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
17422         if (Subtarget->hasFlatAtomicFaddF32Inst())
17423           return ReportUnsafeHWInst(AtomicExpansionKind::None);
17424 
17425         // If it is in the flat address space and the type is float, we will
17426         // try to expand it, provided the target supports both the global and
17427         // the LDS atomic fadd. The expansion emits a runtime address-space
17428         // check: if the address is in the global address space, we emit the
17429         // global atomic fadd; if it is in the shared address space, we emit
17430         // the LDS atomic fadd.
17431         if (Subtarget->hasLDSFPAtomicAddF32()) {
17432           if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
17433             return AtomicExpansionKind::Expand;
17434           if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
17435             return AtomicExpansionKind::Expand;
17436         }
17437       }
17438     }
17439 
17440     return AtomicExpansionKind::CmpXChg;
17441   }
17442   case AtomicRMWInst::FMin:
17443   case AtomicRMWInst::FMax: {
17444     Type *Ty = RMW->getType();
17445 
17446     // LDS float and double fmin/fmax were always supported.
17447     if (AS == AMDGPUAS::LOCAL_ADDRESS) {
17448       return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
17449                                                  : AtomicExpansionKind::CmpXChg;
17450     }
17451 
17452     if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
17453       // For flat and global cases:
17454       // float, double in gfx7. Manual claims denormal support.
17455       // Removed in gfx8.
17456       // float, double restored in gfx10.
17457       // double removed again in gfx11, so only f32 for gfx11/gfx12.
17458       //
17459       // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
17460       // no f32.
17461       if (AS == AMDGPUAS::FLAT_ADDRESS) {
17462         if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
17463           return ReportUnsafeHWInst(AtomicExpansionKind::None);
17464         if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
17465           return ReportUnsafeHWInst(AtomicExpansionKind::None);
17466       } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
17467                  AS == AMDGPUAS::BUFFER_FAT_POINTER) {
17468         if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
17469           return ReportUnsafeHWInst(AtomicExpansionKind::None);
17470         if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
17471           return ReportUnsafeHWInst(AtomicExpansionKind::None);
17472       }
17473     }
17474 
17475     return AtomicExpansionKind::CmpXChg;
17476   }
17477   case AtomicRMWInst::Min:
17478   case AtomicRMWInst::Max:
17479   case AtomicRMWInst::UMin:
17480   case AtomicRMWInst::UMax: {
17481     if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
17482         AS == AMDGPUAS::BUFFER_FAT_POINTER) {
17483       // Always expand system scope min/max atomics.
17484       if (HasSystemScope)
17485         return AtomicExpansionKind::CmpXChg;
17486     }
17487 
17488     return atomicSupportedIfLegalIntType(RMW);
17489   }
17490   case AtomicRMWInst::Nand:
17491   case AtomicRMWInst::FSub:
17492   default:
17493     return AtomicExpansionKind::CmpXChg;
17494   }
17495 
17496   llvm_unreachable("covered atomicrmw op switch");
17497 }
17498 
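// Private memory is per-lane scratch, so atomic loads and stores to it need
// no atomicity and are lowered to ordinary accesses.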
17499 TargetLowering::AtomicExpansionKind
17500 SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
17501   return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
17502              ? AtomicExpansionKind::NotAtomic
17503              : AtomicExpansionKind::None;
17504 }
17505 
17506 TargetLowering::AtomicExpansionKind
17507 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
17508   return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
17509              ? AtomicExpansionKind::NotAtomic
17510              : AtomicExpansionKind::None;
17511 }
17512 
17513 TargetLowering::AtomicExpansionKind
17514 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
17515   unsigned AddrSpace = CmpX->getPointerAddressSpace();
17516   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
17517     return AtomicExpansionKind::NotAtomic;
17518 
17519   if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
17520     return AtomicExpansionKind::None;
17521 
17522   const DataLayout &DL = CmpX->getDataLayout();
17523 
17524   Type *ValTy = CmpX->getNewValOperand()->getType();
17525 
17526   // If a 64-bit flat atomic may alias private, we need to avoid using the
17527   // atomic in the private case.
17528   return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
17529                                            : AtomicExpansionKind::None;
17530 }
17531 
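// Pick a register class matching the divergence of the value: uniform values
// get SGPR classes (i1 maps to the wave-mask class), and divergent values get
// the equivalent VGPR class.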
17532 const TargetRegisterClass *
17533 SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
17534   const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
17535   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17536   if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
17537     return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
17538                                  : &AMDGPU::SReg_32RegClass;
17539   if (!TRI->isSGPRClass(RC) && !isDivergent)
17540     return TRI->getEquivalentSGPRClass(RC);
17541   if (TRI->isSGPRClass(RC) && isDivergent)
17542     return TRI->getEquivalentVGPRClass(RC);
17543 
17544   return RC;
17545 }
17546 
17547 // FIXME: This is a workaround for DivergenceAnalysis not understanding always
17548 // uniform values (as produced by the mask results of control flow intrinsics)
17549 // used outside of divergent blocks. The phi users need to also be treated as
17550 // always uniform.
17551 //
17552 // FIXME: DA is no longer in use. Does this still apply to UniformityAnalysis?
17553 static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
17554                       unsigned WaveSize) {
17555   // FIXME: We assume we never cast the mask results of a control flow
17556   // intrinsic.
17557   // As a compile-time hack, exit early if the type cannot match the wave size.
17558   IntegerType *IT = dyn_cast<IntegerType>(V->getType());
17559   if (!IT || IT->getBitWidth() != WaveSize)
17560     return false;
17561 
17562   if (!isa<Instruction>(V))
17563     return false;
17564   if (!Visited.insert(V).second)
17565     return false;
17566   bool Result = false;
17567   for (const auto *U : V->users()) {
17568     if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
17569       if (V == U->getOperand(1)) {
17570         switch (Intrinsic->getIntrinsicID()) {
17571         default:
17572           Result = false;
17573           break;
17574         case Intrinsic::amdgcn_if_break:
17575         case Intrinsic::amdgcn_if:
17576         case Intrinsic::amdgcn_else:
17577           Result = true;
17578           break;
17579         }
17580       }
17581       if (V == U->getOperand(0)) {
17582         switch (Intrinsic->getIntrinsicID()) {
17583         default:
17584           Result = false;
17585           break;
17586         case Intrinsic::amdgcn_end_cf:
17587         case Intrinsic::amdgcn_loop:
17588           Result = true;
17589           break;
17590         }
17591       }
17592     } else {
17593       Result = hasCFUser(U, Visited, WaveSize);
17594     }
17595     if (Result)
17596       break;
17597   }
17598   return Result;
17599 }
17600 
17601 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
17602                                                const Value *V) const {
17603   if (const CallInst *CI = dyn_cast<CallInst>(V)) {
17604     if (CI->isInlineAsm()) {
17605       // FIXME: This cannot give a correct answer. This should only trigger in
17606       // the case where inline asm returns mixed SGPR and VGPR results, used
17607       // outside the defining block. We don't have a specific result to
17608       // consider, so this assumes that if any value is SGPR, the overall
17609       // register also needs to be SGPR.
17610       const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
17611       TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
17612           MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
17613       for (auto &TC : TargetConstraints) {
17614         if (TC.Type == InlineAsm::isOutput) {
17615           ComputeConstraintToUse(TC, SDValue());
17616           const TargetRegisterClass *RC =
17617               getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
17618                                            TC.ConstraintVT)
17619                   .second;
17620           if (RC && SIRI->isSGPRClass(RC))
17621             return true;
17622         }
17623       }
17624     }
17625   }
17626   SmallPtrSet<const Value *, 16> Visited;
17627   return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
17628 }
17629 
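// Return true if any user of \p N is a memory node that uses \p N as its base
// pointer operand.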
17630 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
17631   for (SDUse &Use : N->uses()) {
17632     if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
17633       if (getBasePtrIndex(M) == Use.getOperandNo())
17634         return true;
17635     }
17636   }
17637   return false;
17638 }
17639 
17640 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
17641                                            SDValue N1) const {
17642   if (!N0.hasOneUse())
17643     return false;
17644   // Take the opportunity to keep N0 uniform.
17645   if (N0->isDivergent() || !N1->isDivergent())
17646     return true;
17647   // Check if we have a good chance to form a (base + constant offset) memory
17648   // access pattern.
17649   return (DAG.isBaseWithConstantOffset(N0) &&
17650           hasMemSDNodeUser(*N0->user_begin()));
17651 }
17652 
17653 bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
17654                                            Register N0, Register N1) const {
17655   return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
17656 }
17657 
17658 MachineMemOperand::Flags
17659 SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
17660   // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
17661   MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
17662   if (I.getMetadata("amdgpu.noclobber"))
17663     Flags |= MONoClobber;
17664   if (I.getMetadata("amdgpu.last.use"))
17665     Flags |= MOLastUse;
17666   return Flags;
17667 }
17668 
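// Recognize a physical-register dependency through SCC: a compare that
// implicitly defines SCC feeding an i1 CopyToReg. Report SCC and the cost of
// copying out of it.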
17669 bool SITargetLowering::checkForPhysRegDependency(
17670     SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
17671     const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const {
17672   if (User->getOpcode() != ISD::CopyToReg)
17673     return false;
17674   if (!Def->isMachineOpcode())
17675     return false;
17676   MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
17677   if (!MDef)
17678     return false;
17679 
17680   unsigned ResNo = User->getOperand(Op).getResNo();
17681   if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
17682     return false;
17683   const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
17684   if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
17685     PhysReg = AMDGPU::SCC;
17686     const TargetRegisterClass *RC =
17687         TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
17688     Cost = RC->getCopyCost();
17689     return true;
17690   }
17691   return false;
17692 }
17693 
17694 void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
17695     Instruction *AI) const {
17696   // Given: atomicrmw fadd ptr %addr, float %val ordering
17697   //
17698   // With this expansion we produce the following code:
17699   //   [...]
17700   //   %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
17701   //   br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
17702   //
17703   // atomicrmw.shared:
17704   //   %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
17705   //   %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
17706   //                                   float %val ordering
17707   //   br label %atomicrmw.phi
17708   //
17709   // atomicrmw.check.private:
17710   //   %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
17711   //   br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
17712   //
17713   // atomicrmw.private:
17714   //   %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
17715   //   %loaded.private = load float, ptr addrspace(5) %cast.private
17716   //   %val.new = fadd float %loaded.private, %val
17717   //   store float %val.new, ptr addrspace(5) %cast.private
17718   //   br label %atomicrmw.phi
17719   //
17720   // atomicrmw.global:
17721   //   %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
17722   //   %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
17723   //                                   float %val ordering
17724   //   br label %atomicrmw.phi
17725   //
17726   // atomicrmw.phi:
17727   //   %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
17728   //                           [ %loaded.private, %atomicrmw.private ],
17729   //                           [ %loaded.global, %atomicrmw.global ]
17730   //   br label %atomicrmw.end
17731   //
17732   // atomicrmw.end:
17733   //    [...]
17734   //
17735   //
17736   // For 64-bit atomics which may reside in private memory, we use a simpler
17737   // expansion that only inserts the private check and keeps the flat operation.
17738 
17739   IRBuilder<> Builder(AI);
17740   LLVMContext &Ctx = Builder.getContext();
17741 
17742   auto *RMW = dyn_cast<AtomicRMWInst>(AI);
17743   const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
17744                                 : AtomicCmpXchgInst::getPointerOperandIndex();
17745   Value *Addr = AI->getOperand(PtrOpIdx);
17746 
17747   // TODO: Only need to check private, then emit flat-known-not-private (no
17748   // need for the shared block, or the cast to global).
17749   AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
17750 
17751   Align Alignment;
17752   if (RMW)
17753     Alignment = RMW->getAlign();
17754   else if (CX)
17755     Alignment = CX->getAlign();
17756   else
17757     llvm_unreachable("unhandled atomic operation");
17758 
17759   // FullFlatEmulation is true if we need to issue the private, shared, and
17760   // global cases.
17761   //
17762   // If this is false, we are only dealing with the flat-targeting-private case,
17763   // where we only insert a check for private and still use the flat instruction
17764   // for global and shared.
17765 
17766   bool FullFlatEmulation =
17767       RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
17768       ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
17769        (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
17770         RMW->getType()->isDoubleTy()));
17771 
17772   // If the return value isn't used, do not introduce a false use in the phi.
17773   bool ReturnValueIsUsed = !AI->use_empty();
17774 
17775   BasicBlock *BB = Builder.GetInsertBlock();
17776   Function *F = BB->getParent();
17777   BasicBlock *ExitBB =
17778       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
17779   BasicBlock *SharedBB = nullptr;
17780 
17781   BasicBlock *CheckPrivateBB = BB;
17782   if (FullFlatEmulation) {
17783     SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
17784     CheckPrivateBB =
17785         BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
17786   }
17787 
17788   BasicBlock *PrivateBB =
17789       BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
17790   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
17791   BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
17792 
17793   std::prev(BB->end())->eraseFromParent();
17794   Builder.SetInsertPoint(BB);
17795 
17796   Value *LoadedShared = nullptr;
17797   if (FullFlatEmulation) {
17798     CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
17799                                                  {Addr}, nullptr, "is.shared");
17800     Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
17801     Builder.SetInsertPoint(SharedBB);
17802     Value *CastToLocal = Builder.CreateAddrSpaceCast(
17803         Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
17804 
17805     Instruction *Clone = AI->clone();
17806     Clone->insertInto(SharedBB, SharedBB->end());
17807     Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
17808     LoadedShared = Clone;
17809 
17810     Builder.CreateBr(PhiBB);
17811     Builder.SetInsertPoint(CheckPrivateBB);
17812   }
17813 
17814   CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
17815                                                 {Addr}, nullptr, "is.private");
17816   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
17817 
17818   Builder.SetInsertPoint(PrivateBB);
17819 
17820   Value *CastToPrivate = Builder.CreateAddrSpaceCast(
17821       Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
17822 
17823   Value *LoadedPrivate;
17824   if (RMW) {
17825     LoadedPrivate = Builder.CreateAlignedLoad(
17826         RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
17827 
17828     Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
17829                                         LoadedPrivate, RMW->getValOperand());
17830 
17831     Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
17832   } else {
17833     auto [ResultLoad, Equal] =
17834         buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
17835                           CX->getNewValOperand(), CX->getAlign());
17836 
17837     Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
17838                                               ResultLoad, 0);
17839     LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
17840   }
17841 
17842   Builder.CreateBr(PhiBB);
17843 
17844   Builder.SetInsertPoint(GlobalBB);
17845 
17846   // Continue using a flat instruction if we only emitted the check for private.
17847   Instruction *LoadedGlobal = AI;
17848   if (FullFlatEmulation) {
17849     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
17850         Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
17851     AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
17852   }
17853 
17854   AI->removeFromParent();
17855   AI->insertInto(GlobalBB, GlobalBB->end());
17856 
17857   // The new atomicrmw may go through another round of legalization later.
17858   if (!FullFlatEmulation) {
17859     // We inserted the runtime check already, make sure we do not try to
17860     // re-expand this.
17861     // TODO: Should union with any existing metadata.
17862     MDBuilder MDB(F->getContext());
17863     MDNode *RangeNotPrivate =
17864         MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
17865                         APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
17866     LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
17867                               RangeNotPrivate);
17868   }
17869 
17870   Builder.CreateBr(PhiBB);
17871 
17872   Builder.SetInsertPoint(PhiBB);
17873 
17874   if (ReturnValueIsUsed) {
17875     PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
17876     AI->replaceAllUsesWith(Loaded);
17877     if (FullFlatEmulation)
17878       Loaded->addIncoming(LoadedShared, SharedBB);
17879     Loaded->addIncoming(LoadedPrivate, PrivateBB);
17880     Loaded->addIncoming(LoadedGlobal, GlobalBB);
17881     Loaded->takeName(AI);
17882   }
17883 
17884   Builder.CreateBr(ExitBB);
17885 }
17886 
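// IR-level atomicrmw expansion: rewrite identity sub/or/xor of zero back into
// add (undoing InstCombine's canonicalization to or), then insert the
// address-space predicate expansion for flat atomics that may access private.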
17887 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
17888   AtomicRMWInst::BinOp Op = AI->getOperation();
17889 
17890   if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
17891       Op == AtomicRMWInst::Xor) {
17892     if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
17893         ConstVal && ConstVal->isNullValue()) {
17894       // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
17895       AI->setOperation(AtomicRMWInst::Add);
17896 
17897       // We may still need the private-alias-flat handling below.
17898 
17899       // TODO: Skip this for cases where we cannot access remote memory.
17900     }
17901   }
17902 
17903   // The non-flat expansions should only perform the de-canonicalization of
17904   // identity values.
17905   if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
17906     return;
17907 
17908   emitExpandAtomicAddrSpacePredicate(AI);
17909 }
17910 
17911 void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
17912   emitExpandAtomicAddrSpacePredicate(CI);
17913 }
17914 
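// Replace an idempotent atomicrmw with an atomic load of the same location,
// unless the ordering has release semantics, since the store aspect is what
// performs the required cache flush.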
17915 LoadInst *
17916 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
17917   IRBuilder<> Builder(AI);
17918   auto Order = AI->getOrdering();
17919 
17920   // The optimization removes the store aspect of the atomicrmw. Therefore, the
17921   // cache must be flushed if the atomic ordering had release semantics. This
17922   // does not necessarily require a fence; a release fence just happens to do
17923   // that flush. Avoid replacing an atomicrmw that has release semantics.
17924   if (isReleaseOrStronger(Order))
17925     return nullptr;
17926 
17927   LoadInst *LI = Builder.CreateAlignedLoad(
17928       AI->getType(), AI->getPointerOperand(), AI->getAlign());
17929   LI->setAtomic(Order, AI->getSyncScopeID());
17930   LI->copyMetadata(*AI);
17931   LI->takeName(AI);
17932   AI->replaceAllUsesWith(LI);
17933   AI->eraseFromParent();
17934   return LI;
17935 }
17936