xref: /freebsd/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp (revision b2d2a78ad80ec68d4a17f5aef97d21686cb1e29b)
1 //===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines an instruction selector for the NVPTX target.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "NVPTXISelDAGToDAG.h"
14 #include "MCTargetDesc/NVPTXBaseInfo.h"
15 #include "NVPTXUtilities.h"
16 #include "llvm/Analysis/ValueTracking.h"
17 #include "llvm/CodeGen/ISDOpcodes.h"
18 #include "llvm/IR/GlobalValue.h"
19 #include "llvm/IR/Instructions.h"
20 #include "llvm/IR/IntrinsicsNVPTX.h"
21 #include "llvm/Support/AtomicOrdering.h"
22 #include "llvm/Support/CommandLine.h"
23 #include "llvm/Support/Debug.h"
24 #include "llvm/Support/ErrorHandling.h"
25 #include "llvm/Support/raw_ostream.h"
26 #include "llvm/Target/TargetIntrinsicInfo.h"
27 
28 using namespace llvm;
29 
30 #define DEBUG_TYPE "nvptx-isel"
31 #define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
32 
33 static cl::opt<bool>
34     EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
35                    cl::desc("Enable reciprocal sqrt optimization"));
36 
37 /// createNVPTXISelDag - This pass converts a legalized DAG into a
38 /// NVPTX-specific DAG, ready for instruction scheduling.
39 FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
40                                        llvm::CodeGenOptLevel OptLevel) {
41   return new NVPTXDAGToDAGISelLegacy(TM, OptLevel);
42 }
43 
44 NVPTXDAGToDAGISelLegacy::NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm,
45                                                  CodeGenOptLevel OptLevel)
46     : SelectionDAGISelLegacy(
47           ID, std::make_unique<NVPTXDAGToDAGISel>(tm, OptLevel)) {}
48 
49 char NVPTXDAGToDAGISelLegacy::ID = 0;
50 
51 INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
52 
53 NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
54                                      CodeGenOptLevel OptLevel)
55     : SelectionDAGISel(tm, OptLevel), TM(tm) {
56   doMulWide = (OptLevel > CodeGenOptLevel::None);
57 }
58 
59 bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
60   Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
61   return SelectionDAGISel::runOnMachineFunction(MF);
62 }
63 
64 int NVPTXDAGToDAGISel::getDivF32Level() const {
65   return Subtarget->getTargetLowering()->getDivF32Level();
66 }
67 
68 bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
69   return Subtarget->getTargetLowering()->usePrecSqrtF32();
70 }
71 
72 bool NVPTXDAGToDAGISel::useF32FTZ() const {
73   return Subtarget->getTargetLowering()->useF32FTZ(*MF);
74 }
75 
76 bool NVPTXDAGToDAGISel::allowFMA() const {
77   const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
78   return TL->allowFMA(*MF, OptLevel);
79 }
80 
81 bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
82   const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
83   return TL->allowUnsafeFPMath(*MF);
84 }
85 
86 bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
87 
88 /// Select - Select instructions not customized! Used for
89 /// expanded, promoted and normal instructions.
90 void NVPTXDAGToDAGISel::Select(SDNode *N) {
91 
92   if (N->isMachineOpcode()) {
93     N->setNodeId(-1);
94     return; // Already selected.
95   }
96 
97   switch (N->getOpcode()) {
98   case ISD::LOAD:
99   case ISD::ATOMIC_LOAD:
100     if (tryLoad(N))
101       return;
102     break;
103   case ISD::STORE:
104   case ISD::ATOMIC_STORE:
105     if (tryStore(N))
106       return;
107     break;
108   case ISD::EXTRACT_VECTOR_ELT:
109     if (tryEXTRACT_VECTOR_ELEMENT(N))
110       return;
111     break;
112   case NVPTXISD::SETP_F16X2:
113     SelectSETP_F16X2(N);
114     return;
115   case NVPTXISD::SETP_BF16X2:
116     SelectSETP_BF16X2(N);
117     return;
118   case NVPTXISD::LoadV2:
119   case NVPTXISD::LoadV4:
120     if (tryLoadVector(N))
121       return;
122     break;
123   case NVPTXISD::LDGV2:
124   case NVPTXISD::LDGV4:
125   case NVPTXISD::LDUV2:
126   case NVPTXISD::LDUV4:
127     if (tryLDGLDU(N))
128       return;
129     break;
130   case NVPTXISD::StoreV2:
131   case NVPTXISD::StoreV4:
132     if (tryStoreVector(N))
133       return;
134     break;
135   case NVPTXISD::LoadParam:
136   case NVPTXISD::LoadParamV2:
137   case NVPTXISD::LoadParamV4:
138     if (tryLoadParam(N))
139       return;
140     break;
141   case NVPTXISD::StoreRetval:
142   case NVPTXISD::StoreRetvalV2:
143   case NVPTXISD::StoreRetvalV4:
144     if (tryStoreRetval(N))
145       return;
146     break;
147   case NVPTXISD::StoreParam:
148   case NVPTXISD::StoreParamV2:
149   case NVPTXISD::StoreParamV4:
150   case NVPTXISD::StoreParamS32:
151   case NVPTXISD::StoreParamU32:
152     if (tryStoreParam(N))
153       return;
154     break;
155   case ISD::INTRINSIC_WO_CHAIN:
156     if (tryIntrinsicNoChain(N))
157       return;
158     break;
159   case ISD::INTRINSIC_W_CHAIN:
160     if (tryIntrinsicChain(N))
161       return;
162     break;
163   case NVPTXISD::Tex1DFloatS32:
164   case NVPTXISD::Tex1DFloatFloat:
165   case NVPTXISD::Tex1DFloatFloatLevel:
166   case NVPTXISD::Tex1DFloatFloatGrad:
167   case NVPTXISD::Tex1DS32S32:
168   case NVPTXISD::Tex1DS32Float:
169   case NVPTXISD::Tex1DS32FloatLevel:
170   case NVPTXISD::Tex1DS32FloatGrad:
171   case NVPTXISD::Tex1DU32S32:
172   case NVPTXISD::Tex1DU32Float:
173   case NVPTXISD::Tex1DU32FloatLevel:
174   case NVPTXISD::Tex1DU32FloatGrad:
175   case NVPTXISD::Tex1DArrayFloatS32:
176   case NVPTXISD::Tex1DArrayFloatFloat:
177   case NVPTXISD::Tex1DArrayFloatFloatLevel:
178   case NVPTXISD::Tex1DArrayFloatFloatGrad:
179   case NVPTXISD::Tex1DArrayS32S32:
180   case NVPTXISD::Tex1DArrayS32Float:
181   case NVPTXISD::Tex1DArrayS32FloatLevel:
182   case NVPTXISD::Tex1DArrayS32FloatGrad:
183   case NVPTXISD::Tex1DArrayU32S32:
184   case NVPTXISD::Tex1DArrayU32Float:
185   case NVPTXISD::Tex1DArrayU32FloatLevel:
186   case NVPTXISD::Tex1DArrayU32FloatGrad:
187   case NVPTXISD::Tex2DFloatS32:
188   case NVPTXISD::Tex2DFloatFloat:
189   case NVPTXISD::Tex2DFloatFloatLevel:
190   case NVPTXISD::Tex2DFloatFloatGrad:
191   case NVPTXISD::Tex2DS32S32:
192   case NVPTXISD::Tex2DS32Float:
193   case NVPTXISD::Tex2DS32FloatLevel:
194   case NVPTXISD::Tex2DS32FloatGrad:
195   case NVPTXISD::Tex2DU32S32:
196   case NVPTXISD::Tex2DU32Float:
197   case NVPTXISD::Tex2DU32FloatLevel:
198   case NVPTXISD::Tex2DU32FloatGrad:
199   case NVPTXISD::Tex2DArrayFloatS32:
200   case NVPTXISD::Tex2DArrayFloatFloat:
201   case NVPTXISD::Tex2DArrayFloatFloatLevel:
202   case NVPTXISD::Tex2DArrayFloatFloatGrad:
203   case NVPTXISD::Tex2DArrayS32S32:
204   case NVPTXISD::Tex2DArrayS32Float:
205   case NVPTXISD::Tex2DArrayS32FloatLevel:
206   case NVPTXISD::Tex2DArrayS32FloatGrad:
207   case NVPTXISD::Tex2DArrayU32S32:
208   case NVPTXISD::Tex2DArrayU32Float:
209   case NVPTXISD::Tex2DArrayU32FloatLevel:
210   case NVPTXISD::Tex2DArrayU32FloatGrad:
211   case NVPTXISD::Tex3DFloatS32:
212   case NVPTXISD::Tex3DFloatFloat:
213   case NVPTXISD::Tex3DFloatFloatLevel:
214   case NVPTXISD::Tex3DFloatFloatGrad:
215   case NVPTXISD::Tex3DS32S32:
216   case NVPTXISD::Tex3DS32Float:
217   case NVPTXISD::Tex3DS32FloatLevel:
218   case NVPTXISD::Tex3DS32FloatGrad:
219   case NVPTXISD::Tex3DU32S32:
220   case NVPTXISD::Tex3DU32Float:
221   case NVPTXISD::Tex3DU32FloatLevel:
222   case NVPTXISD::Tex3DU32FloatGrad:
223   case NVPTXISD::TexCubeFloatFloat:
224   case NVPTXISD::TexCubeFloatFloatLevel:
225   case NVPTXISD::TexCubeS32Float:
226   case NVPTXISD::TexCubeS32FloatLevel:
227   case NVPTXISD::TexCubeU32Float:
228   case NVPTXISD::TexCubeU32FloatLevel:
229   case NVPTXISD::TexCubeArrayFloatFloat:
230   case NVPTXISD::TexCubeArrayFloatFloatLevel:
231   case NVPTXISD::TexCubeArrayS32Float:
232   case NVPTXISD::TexCubeArrayS32FloatLevel:
233   case NVPTXISD::TexCubeArrayU32Float:
234   case NVPTXISD::TexCubeArrayU32FloatLevel:
235   case NVPTXISD::Tld4R2DFloatFloat:
236   case NVPTXISD::Tld4G2DFloatFloat:
237   case NVPTXISD::Tld4B2DFloatFloat:
238   case NVPTXISD::Tld4A2DFloatFloat:
239   case NVPTXISD::Tld4R2DS64Float:
240   case NVPTXISD::Tld4G2DS64Float:
241   case NVPTXISD::Tld4B2DS64Float:
242   case NVPTXISD::Tld4A2DS64Float:
243   case NVPTXISD::Tld4R2DU64Float:
244   case NVPTXISD::Tld4G2DU64Float:
245   case NVPTXISD::Tld4B2DU64Float:
246   case NVPTXISD::Tld4A2DU64Float:
247   case NVPTXISD::TexUnified1DFloatS32:
248   case NVPTXISD::TexUnified1DFloatFloat:
249   case NVPTXISD::TexUnified1DFloatFloatLevel:
250   case NVPTXISD::TexUnified1DFloatFloatGrad:
251   case NVPTXISD::TexUnified1DS32S32:
252   case NVPTXISD::TexUnified1DS32Float:
253   case NVPTXISD::TexUnified1DS32FloatLevel:
254   case NVPTXISD::TexUnified1DS32FloatGrad:
255   case NVPTXISD::TexUnified1DU32S32:
256   case NVPTXISD::TexUnified1DU32Float:
257   case NVPTXISD::TexUnified1DU32FloatLevel:
258   case NVPTXISD::TexUnified1DU32FloatGrad:
259   case NVPTXISD::TexUnified1DArrayFloatS32:
260   case NVPTXISD::TexUnified1DArrayFloatFloat:
261   case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
262   case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
263   case NVPTXISD::TexUnified1DArrayS32S32:
264   case NVPTXISD::TexUnified1DArrayS32Float:
265   case NVPTXISD::TexUnified1DArrayS32FloatLevel:
266   case NVPTXISD::TexUnified1DArrayS32FloatGrad:
267   case NVPTXISD::TexUnified1DArrayU32S32:
268   case NVPTXISD::TexUnified1DArrayU32Float:
269   case NVPTXISD::TexUnified1DArrayU32FloatLevel:
270   case NVPTXISD::TexUnified1DArrayU32FloatGrad:
271   case NVPTXISD::TexUnified2DFloatS32:
272   case NVPTXISD::TexUnified2DFloatFloat:
273   case NVPTXISD::TexUnified2DFloatFloatLevel:
274   case NVPTXISD::TexUnified2DFloatFloatGrad:
275   case NVPTXISD::TexUnified2DS32S32:
276   case NVPTXISD::TexUnified2DS32Float:
277   case NVPTXISD::TexUnified2DS32FloatLevel:
278   case NVPTXISD::TexUnified2DS32FloatGrad:
279   case NVPTXISD::TexUnified2DU32S32:
280   case NVPTXISD::TexUnified2DU32Float:
281   case NVPTXISD::TexUnified2DU32FloatLevel:
282   case NVPTXISD::TexUnified2DU32FloatGrad:
283   case NVPTXISD::TexUnified2DArrayFloatS32:
284   case NVPTXISD::TexUnified2DArrayFloatFloat:
285   case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
286   case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
287   case NVPTXISD::TexUnified2DArrayS32S32:
288   case NVPTXISD::TexUnified2DArrayS32Float:
289   case NVPTXISD::TexUnified2DArrayS32FloatLevel:
290   case NVPTXISD::TexUnified2DArrayS32FloatGrad:
291   case NVPTXISD::TexUnified2DArrayU32S32:
292   case NVPTXISD::TexUnified2DArrayU32Float:
293   case NVPTXISD::TexUnified2DArrayU32FloatLevel:
294   case NVPTXISD::TexUnified2DArrayU32FloatGrad:
295   case NVPTXISD::TexUnified3DFloatS32:
296   case NVPTXISD::TexUnified3DFloatFloat:
297   case NVPTXISD::TexUnified3DFloatFloatLevel:
298   case NVPTXISD::TexUnified3DFloatFloatGrad:
299   case NVPTXISD::TexUnified3DS32S32:
300   case NVPTXISD::TexUnified3DS32Float:
301   case NVPTXISD::TexUnified3DS32FloatLevel:
302   case NVPTXISD::TexUnified3DS32FloatGrad:
303   case NVPTXISD::TexUnified3DU32S32:
304   case NVPTXISD::TexUnified3DU32Float:
305   case NVPTXISD::TexUnified3DU32FloatLevel:
306   case NVPTXISD::TexUnified3DU32FloatGrad:
307   case NVPTXISD::TexUnifiedCubeFloatFloat:
308   case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
309   case NVPTXISD::TexUnifiedCubeS32Float:
310   case NVPTXISD::TexUnifiedCubeS32FloatLevel:
311   case NVPTXISD::TexUnifiedCubeU32Float:
312   case NVPTXISD::TexUnifiedCubeU32FloatLevel:
313   case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
314   case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
315   case NVPTXISD::TexUnifiedCubeArrayS32Float:
316   case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
317   case NVPTXISD::TexUnifiedCubeArrayU32Float:
318   case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
319   case NVPTXISD::TexUnifiedCubeFloatFloatGrad:
320   case NVPTXISD::TexUnifiedCubeS32FloatGrad:
321   case NVPTXISD::TexUnifiedCubeU32FloatGrad:
322   case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad:
323   case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad:
324   case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad:
325   case NVPTXISD::Tld4UnifiedR2DFloatFloat:
326   case NVPTXISD::Tld4UnifiedG2DFloatFloat:
327   case NVPTXISD::Tld4UnifiedB2DFloatFloat:
328   case NVPTXISD::Tld4UnifiedA2DFloatFloat:
329   case NVPTXISD::Tld4UnifiedR2DS64Float:
330   case NVPTXISD::Tld4UnifiedG2DS64Float:
331   case NVPTXISD::Tld4UnifiedB2DS64Float:
332   case NVPTXISD::Tld4UnifiedA2DS64Float:
333   case NVPTXISD::Tld4UnifiedR2DU64Float:
334   case NVPTXISD::Tld4UnifiedG2DU64Float:
335   case NVPTXISD::Tld4UnifiedB2DU64Float:
336   case NVPTXISD::Tld4UnifiedA2DU64Float:
337     if (tryTextureIntrinsic(N))
338       return;
339     break;
340   case NVPTXISD::Suld1DI8Clamp:
341   case NVPTXISD::Suld1DI16Clamp:
342   case NVPTXISD::Suld1DI32Clamp:
343   case NVPTXISD::Suld1DI64Clamp:
344   case NVPTXISD::Suld1DV2I8Clamp:
345   case NVPTXISD::Suld1DV2I16Clamp:
346   case NVPTXISD::Suld1DV2I32Clamp:
347   case NVPTXISD::Suld1DV2I64Clamp:
348   case NVPTXISD::Suld1DV4I8Clamp:
349   case NVPTXISD::Suld1DV4I16Clamp:
350   case NVPTXISD::Suld1DV4I32Clamp:
351   case NVPTXISD::Suld1DArrayI8Clamp:
352   case NVPTXISD::Suld1DArrayI16Clamp:
353   case NVPTXISD::Suld1DArrayI32Clamp:
354   case NVPTXISD::Suld1DArrayI64Clamp:
355   case NVPTXISD::Suld1DArrayV2I8Clamp:
356   case NVPTXISD::Suld1DArrayV2I16Clamp:
357   case NVPTXISD::Suld1DArrayV2I32Clamp:
358   case NVPTXISD::Suld1DArrayV2I64Clamp:
359   case NVPTXISD::Suld1DArrayV4I8Clamp:
360   case NVPTXISD::Suld1DArrayV4I16Clamp:
361   case NVPTXISD::Suld1DArrayV4I32Clamp:
362   case NVPTXISD::Suld2DI8Clamp:
363   case NVPTXISD::Suld2DI16Clamp:
364   case NVPTXISD::Suld2DI32Clamp:
365   case NVPTXISD::Suld2DI64Clamp:
366   case NVPTXISD::Suld2DV2I8Clamp:
367   case NVPTXISD::Suld2DV2I16Clamp:
368   case NVPTXISD::Suld2DV2I32Clamp:
369   case NVPTXISD::Suld2DV2I64Clamp:
370   case NVPTXISD::Suld2DV4I8Clamp:
371   case NVPTXISD::Suld2DV4I16Clamp:
372   case NVPTXISD::Suld2DV4I32Clamp:
373   case NVPTXISD::Suld2DArrayI8Clamp:
374   case NVPTXISD::Suld2DArrayI16Clamp:
375   case NVPTXISD::Suld2DArrayI32Clamp:
376   case NVPTXISD::Suld2DArrayI64Clamp:
377   case NVPTXISD::Suld2DArrayV2I8Clamp:
378   case NVPTXISD::Suld2DArrayV2I16Clamp:
379   case NVPTXISD::Suld2DArrayV2I32Clamp:
380   case NVPTXISD::Suld2DArrayV2I64Clamp:
381   case NVPTXISD::Suld2DArrayV4I8Clamp:
382   case NVPTXISD::Suld2DArrayV4I16Clamp:
383   case NVPTXISD::Suld2DArrayV4I32Clamp:
384   case NVPTXISD::Suld3DI8Clamp:
385   case NVPTXISD::Suld3DI16Clamp:
386   case NVPTXISD::Suld3DI32Clamp:
387   case NVPTXISD::Suld3DI64Clamp:
388   case NVPTXISD::Suld3DV2I8Clamp:
389   case NVPTXISD::Suld3DV2I16Clamp:
390   case NVPTXISD::Suld3DV2I32Clamp:
391   case NVPTXISD::Suld3DV2I64Clamp:
392   case NVPTXISD::Suld3DV4I8Clamp:
393   case NVPTXISD::Suld3DV4I16Clamp:
394   case NVPTXISD::Suld3DV4I32Clamp:
395   case NVPTXISD::Suld1DI8Trap:
396   case NVPTXISD::Suld1DI16Trap:
397   case NVPTXISD::Suld1DI32Trap:
398   case NVPTXISD::Suld1DI64Trap:
399   case NVPTXISD::Suld1DV2I8Trap:
400   case NVPTXISD::Suld1DV2I16Trap:
401   case NVPTXISD::Suld1DV2I32Trap:
402   case NVPTXISD::Suld1DV2I64Trap:
403   case NVPTXISD::Suld1DV4I8Trap:
404   case NVPTXISD::Suld1DV4I16Trap:
405   case NVPTXISD::Suld1DV4I32Trap:
406   case NVPTXISD::Suld1DArrayI8Trap:
407   case NVPTXISD::Suld1DArrayI16Trap:
408   case NVPTXISD::Suld1DArrayI32Trap:
409   case NVPTXISD::Suld1DArrayI64Trap:
410   case NVPTXISD::Suld1DArrayV2I8Trap:
411   case NVPTXISD::Suld1DArrayV2I16Trap:
412   case NVPTXISD::Suld1DArrayV2I32Trap:
413   case NVPTXISD::Suld1DArrayV2I64Trap:
414   case NVPTXISD::Suld1DArrayV4I8Trap:
415   case NVPTXISD::Suld1DArrayV4I16Trap:
416   case NVPTXISD::Suld1DArrayV4I32Trap:
417   case NVPTXISD::Suld2DI8Trap:
418   case NVPTXISD::Suld2DI16Trap:
419   case NVPTXISD::Suld2DI32Trap:
420   case NVPTXISD::Suld2DI64Trap:
421   case NVPTXISD::Suld2DV2I8Trap:
422   case NVPTXISD::Suld2DV2I16Trap:
423   case NVPTXISD::Suld2DV2I32Trap:
424   case NVPTXISD::Suld2DV2I64Trap:
425   case NVPTXISD::Suld2DV4I8Trap:
426   case NVPTXISD::Suld2DV4I16Trap:
427   case NVPTXISD::Suld2DV4I32Trap:
428   case NVPTXISD::Suld2DArrayI8Trap:
429   case NVPTXISD::Suld2DArrayI16Trap:
430   case NVPTXISD::Suld2DArrayI32Trap:
431   case NVPTXISD::Suld2DArrayI64Trap:
432   case NVPTXISD::Suld2DArrayV2I8Trap:
433   case NVPTXISD::Suld2DArrayV2I16Trap:
434   case NVPTXISD::Suld2DArrayV2I32Trap:
435   case NVPTXISD::Suld2DArrayV2I64Trap:
436   case NVPTXISD::Suld2DArrayV4I8Trap:
437   case NVPTXISD::Suld2DArrayV4I16Trap:
438   case NVPTXISD::Suld2DArrayV4I32Trap:
439   case NVPTXISD::Suld3DI8Trap:
440   case NVPTXISD::Suld3DI16Trap:
441   case NVPTXISD::Suld3DI32Trap:
442   case NVPTXISD::Suld3DI64Trap:
443   case NVPTXISD::Suld3DV2I8Trap:
444   case NVPTXISD::Suld3DV2I16Trap:
445   case NVPTXISD::Suld3DV2I32Trap:
446   case NVPTXISD::Suld3DV2I64Trap:
447   case NVPTXISD::Suld3DV4I8Trap:
448   case NVPTXISD::Suld3DV4I16Trap:
449   case NVPTXISD::Suld3DV4I32Trap:
450   case NVPTXISD::Suld1DI8Zero:
451   case NVPTXISD::Suld1DI16Zero:
452   case NVPTXISD::Suld1DI32Zero:
453   case NVPTXISD::Suld1DI64Zero:
454   case NVPTXISD::Suld1DV2I8Zero:
455   case NVPTXISD::Suld1DV2I16Zero:
456   case NVPTXISD::Suld1DV2I32Zero:
457   case NVPTXISD::Suld1DV2I64Zero:
458   case NVPTXISD::Suld1DV4I8Zero:
459   case NVPTXISD::Suld1DV4I16Zero:
460   case NVPTXISD::Suld1DV4I32Zero:
461   case NVPTXISD::Suld1DArrayI8Zero:
462   case NVPTXISD::Suld1DArrayI16Zero:
463   case NVPTXISD::Suld1DArrayI32Zero:
464   case NVPTXISD::Suld1DArrayI64Zero:
465   case NVPTXISD::Suld1DArrayV2I8Zero:
466   case NVPTXISD::Suld1DArrayV2I16Zero:
467   case NVPTXISD::Suld1DArrayV2I32Zero:
468   case NVPTXISD::Suld1DArrayV2I64Zero:
469   case NVPTXISD::Suld1DArrayV4I8Zero:
470   case NVPTXISD::Suld1DArrayV4I16Zero:
471   case NVPTXISD::Suld1DArrayV4I32Zero:
472   case NVPTXISD::Suld2DI8Zero:
473   case NVPTXISD::Suld2DI16Zero:
474   case NVPTXISD::Suld2DI32Zero:
475   case NVPTXISD::Suld2DI64Zero:
476   case NVPTXISD::Suld2DV2I8Zero:
477   case NVPTXISD::Suld2DV2I16Zero:
478   case NVPTXISD::Suld2DV2I32Zero:
479   case NVPTXISD::Suld2DV2I64Zero:
480   case NVPTXISD::Suld2DV4I8Zero:
481   case NVPTXISD::Suld2DV4I16Zero:
482   case NVPTXISD::Suld2DV4I32Zero:
483   case NVPTXISD::Suld2DArrayI8Zero:
484   case NVPTXISD::Suld2DArrayI16Zero:
485   case NVPTXISD::Suld2DArrayI32Zero:
486   case NVPTXISD::Suld2DArrayI64Zero:
487   case NVPTXISD::Suld2DArrayV2I8Zero:
488   case NVPTXISD::Suld2DArrayV2I16Zero:
489   case NVPTXISD::Suld2DArrayV2I32Zero:
490   case NVPTXISD::Suld2DArrayV2I64Zero:
491   case NVPTXISD::Suld2DArrayV4I8Zero:
492   case NVPTXISD::Suld2DArrayV4I16Zero:
493   case NVPTXISD::Suld2DArrayV4I32Zero:
494   case NVPTXISD::Suld3DI8Zero:
495   case NVPTXISD::Suld3DI16Zero:
496   case NVPTXISD::Suld3DI32Zero:
497   case NVPTXISD::Suld3DI64Zero:
498   case NVPTXISD::Suld3DV2I8Zero:
499   case NVPTXISD::Suld3DV2I16Zero:
500   case NVPTXISD::Suld3DV2I32Zero:
501   case NVPTXISD::Suld3DV2I64Zero:
502   case NVPTXISD::Suld3DV4I8Zero:
503   case NVPTXISD::Suld3DV4I16Zero:
504   case NVPTXISD::Suld3DV4I32Zero:
505     if (trySurfaceIntrinsic(N))
506       return;
507     break;
508   case ISD::AND:
509   case ISD::SRA:
510   case ISD::SRL:
511     // Try to select BFE
512     if (tryBFE(N))
513       return;
514     break;
515   case ISD::ADDRSPACECAST:
516     SelectAddrSpaceCast(N);
517     return;
518   case ISD::ConstantFP:
519     if (tryConstantFP(N))
520       return;
521     break;
522   case ISD::CopyToReg: {
523     if (N->getOperand(1).getValueType() == MVT::i128) {
524       SelectV2I64toI128(N);
525       return;
526     }
527     break;
528   }
529   case ISD::CopyFromReg: {
530     if (N->getOperand(1).getValueType() == MVT::i128) {
531       SelectI128toV2I64(N);
532       return;
533     }
534     break;
535   }
536   default:
537     break;
538   }
539   SelectCode(N);
540 }
541 
542 bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
543   unsigned IID = N->getConstantOperandVal(1);
544   switch (IID) {
545   default:
546     return false;
547   case Intrinsic::nvvm_ldg_global_f:
548   case Intrinsic::nvvm_ldg_global_i:
549   case Intrinsic::nvvm_ldg_global_p:
550   case Intrinsic::nvvm_ldu_global_f:
551   case Intrinsic::nvvm_ldu_global_i:
552   case Intrinsic::nvvm_ldu_global_p:
553     return tryLDGLDU(N);
554   }
555 }
556 
557 // There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we
558 // have to load them into an .(b)f16 register first.
559 bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) {
560   if (N->getValueType(0) != MVT::f16 && N->getValueType(0) != MVT::bf16)
561     return false;
562   SDValue Val = CurDAG->getTargetConstantFP(
563       cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), N->getValueType(0));
564   SDNode *LoadConstF16 = CurDAG->getMachineNode(
565       (N->getValueType(0) == MVT::f16 ? NVPTX::LOAD_CONST_F16
566                                       : NVPTX::LOAD_CONST_BF16),
567       SDLoc(N), N->getValueType(0), Val);
568   ReplaceNode(N, LoadConstF16);
569   return true;
570 }
571 
572 // Map ISD:CONDCODE value to appropriate CmpMode expected by
573 // NVPTXInstPrinter::printCmpMode()
574 static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
575   using NVPTX::PTXCmpMode::CmpMode;
576   unsigned PTXCmpMode = [](ISD::CondCode CC) {
577     switch (CC) {
578     default:
579       llvm_unreachable("Unexpected condition code.");
580     case ISD::SETOEQ:
581       return CmpMode::EQ;
582     case ISD::SETOGT:
583       return CmpMode::GT;
584     case ISD::SETOGE:
585       return CmpMode::GE;
586     case ISD::SETOLT:
587       return CmpMode::LT;
588     case ISD::SETOLE:
589       return CmpMode::LE;
590     case ISD::SETONE:
591       return CmpMode::NE;
592     case ISD::SETO:
593       return CmpMode::NUM;
594     case ISD::SETUO:
595       return CmpMode::NotANumber;
596     case ISD::SETUEQ:
597       return CmpMode::EQU;
598     case ISD::SETUGT:
599       return CmpMode::GTU;
600     case ISD::SETUGE:
601       return CmpMode::GEU;
602     case ISD::SETULT:
603       return CmpMode::LTU;
604     case ISD::SETULE:
605       return CmpMode::LEU;
606     case ISD::SETUNE:
607       return CmpMode::NEU;
608     case ISD::SETEQ:
609       return CmpMode::EQ;
610     case ISD::SETGT:
611       return CmpMode::GT;
612     case ISD::SETGE:
613       return CmpMode::GE;
614     case ISD::SETLT:
615       return CmpMode::LT;
616     case ISD::SETLE:
617       return CmpMode::LE;
618     case ISD::SETNE:
619       return CmpMode::NE;
620     }
621   }(CondCode.get());
622 
623   if (FTZ)
624     PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
625 
626   return PTXCmpMode;
627 }
628 
629 bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
630   unsigned PTXCmpMode =
631       getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
632   SDLoc DL(N);
633   SDNode *SetP = CurDAG->getMachineNode(
634       NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
635       N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
636   ReplaceNode(N, SetP);
637   return true;
638 }
639 
640 bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
641   unsigned PTXCmpMode =
642       getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
643   SDLoc DL(N);
644   SDNode *SetP = CurDAG->getMachineNode(
645       NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
646       N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
647   ReplaceNode(N, SetP);
648   return true;
649 }
650 
651 // Find all instances of extract_vector_elt that use this v2f16 vector
652 // and coalesce them into a scattering move instruction.
653 bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
654   SDValue Vector = N->getOperand(0);
655 
656   // We only care about 16x2 as it's the only real vector type we
657   // need to deal with.
658   MVT VT = Vector.getSimpleValueType();
659   if (!Isv2x16VT(VT))
660     return false;
661   // Find and record all uses of this vector that extract element 0 or 1.
662   SmallVector<SDNode *, 4> E0, E1;
663   for (auto *U : Vector.getNode()->uses()) {
664     if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
665       continue;
666     if (U->getOperand(0) != Vector)
667       continue;
668     if (const ConstantSDNode *IdxConst =
669             dyn_cast<ConstantSDNode>(U->getOperand(1))) {
670       if (IdxConst->getZExtValue() == 0)
671         E0.push_back(U);
672       else if (IdxConst->getZExtValue() == 1)
673         E1.push_back(U);
674       else
675         llvm_unreachable("Invalid vector index.");
676     }
677   }
678 
679   // There's no point scattering f16x2 if we only ever access one
680   // element of it.
681   if (E0.empty() || E1.empty())
682     return false;
683 
684   // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
685   // into f16,f16 SplitF16x2(V)
686   MVT EltVT = VT.getVectorElementType();
687   SDNode *ScatterOp =
688       CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);
689   for (auto *Node : E0)
690     ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
691   for (auto *Node : E1)
692     ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
693 
694   return true;
695 }
696 
697 static unsigned int getCodeAddrSpace(MemSDNode *N) {
698   const Value *Src = N->getMemOperand()->getValue();
699 
700   if (!Src)
701     return NVPTX::PTXLdStInstCode::GENERIC;
702 
703   if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
704     switch (PT->getAddressSpace()) {
705     case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
706     case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
707     case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
708     case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
709     case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
710     case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
711     default: break;
712     }
713   }
714   return NVPTX::PTXLdStInstCode::GENERIC;
715 }
716 
717 static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
718                           unsigned CodeAddrSpace, MachineFunction *F) {
719   // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
720   // space.
721   //
722   // We have two ways of identifying invariant loads: Loads may be explicitly
723   // marked as invariant, or we may infer them to be invariant.
724   //
725   // We currently infer invariance for loads from
726   //  - constant global variables, and
727   //  - kernel function pointer params that are noalias (i.e. __restrict) and
728   //    never written to.
729   //
730   // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
731   // not during the SelectionDAG phase).
732   //
733   // TODO: Infer invariance only at -O2.  We still want to use ldg at -O0 for
734   // explicitly invariant loads because these are how clang tells us to use ldg
735   // when the user uses a builtin.
736   if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
737     return false;
738 
739   if (N->isInvariant())
740     return true;
741 
742   bool IsKernelFn = isKernelFunction(F->getFunction());
743 
744   // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
745   // because the former looks through phi nodes while the latter does not. We
746   // need to look through phi nodes to handle pointer induction variables.
747   SmallVector<const Value *, 8> Objs;
748   getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);
749 
750   return all_of(Objs, [&](const Value *V) {
751     if (auto *A = dyn_cast<const Argument>(V))
752       return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
753     if (auto *GV = dyn_cast<const GlobalVariable>(V))
754       return GV->isConstant();
755     return false;
756   });
757 }
758 
759 bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
760   unsigned IID = N->getConstantOperandVal(0);
761   switch (IID) {
762   default:
763     return false;
764   case Intrinsic::nvvm_texsurf_handle_internal:
765     SelectTexSurfHandle(N);
766     return true;
767   }
768 }
769 
770 void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
771   // Op 0 is the intrinsic ID
772   SDValue Wrapper = N->getOperand(1);
773   SDValue GlobalVal = Wrapper.getOperand(0);
774   ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
775                                         MVT::i64, GlobalVal));
776 }
777 
778 void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
779   SDValue Src = N->getOperand(0);
780   AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
781   unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
782   unsigned DstAddrSpace = CastN->getDestAddressSpace();
783   assert(SrcAddrSpace != DstAddrSpace &&
784          "addrspacecast must be between different address spaces");
785 
786   if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
787     // Specific to generic
788     unsigned Opc;
789     switch (SrcAddrSpace) {
790     default: report_fatal_error("Bad address space in addrspacecast");
791     case ADDRESS_SPACE_GLOBAL:
792       Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
793       break;
794     case ADDRESS_SPACE_SHARED:
795       Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
796                                 ? NVPTX::cvta_shared_6432
797                                 : NVPTX::cvta_shared_64)
798                          : NVPTX::cvta_shared;
799       break;
800     case ADDRESS_SPACE_CONST:
801       Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
802                                 ? NVPTX::cvta_const_6432
803                                 : NVPTX::cvta_const_64)
804                          : NVPTX::cvta_const;
805       break;
806     case ADDRESS_SPACE_LOCAL:
807       Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
808                                 ? NVPTX::cvta_local_6432
809                                 : NVPTX::cvta_local_64)
810                          : NVPTX::cvta_local;
811       break;
812     }
813     ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
814                                           Src));
815     return;
816   } else {
817     // Generic to specific
818     if (SrcAddrSpace != 0)
819       report_fatal_error("Cannot cast between two non-generic address spaces");
820     unsigned Opc;
821     switch (DstAddrSpace) {
822     default: report_fatal_error("Bad address space in addrspacecast");
823     case ADDRESS_SPACE_GLOBAL:
824       Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
825       break;
826     case ADDRESS_SPACE_SHARED:
827       Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
828                                 ? NVPTX::cvta_to_shared_3264
829                                 : NVPTX::cvta_to_shared_64)
830                          : NVPTX::cvta_to_shared;
831       break;
832     case ADDRESS_SPACE_CONST:
833       Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
834                                 ? NVPTX::cvta_to_const_3264
835                                 : NVPTX::cvta_to_const_64)
836                          : NVPTX::cvta_to_const;
837       break;
838     case ADDRESS_SPACE_LOCAL:
839       Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
840                                 ? NVPTX::cvta_to_local_3264
841                                 : NVPTX::cvta_to_local_64)
842                          : NVPTX::cvta_to_local;
843       break;
844     case ADDRESS_SPACE_PARAM:
845       Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
846                          : NVPTX::nvvm_ptr_gen_to_param;
847       break;
848     }
849     ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
850                                           Src));
851     return;
852   }
853 }
854 
855 // Helper function template to reduce amount of boilerplate code for
856 // opcode selection.
857 static std::optional<unsigned>
858 pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
859                 unsigned Opcode_i16, unsigned Opcode_i32,
860                 std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,
861                 std::optional<unsigned> Opcode_f64) {
862   switch (VT) {
863   case MVT::i1:
864   case MVT::i8:
865     return Opcode_i8;
866   case MVT::i16:
867     return Opcode_i16;
868   case MVT::i32:
869     return Opcode_i32;
870   case MVT::i64:
871     return Opcode_i64;
872   case MVT::f16:
873   case MVT::bf16:
874     return Opcode_i16;
875   case MVT::v2f16:
876   case MVT::v2bf16:
877   case MVT::v2i16:
878   case MVT::v4i8:
879     return Opcode_i32;
880   case MVT::f32:
881     return Opcode_f32;
882   case MVT::f64:
883     return Opcode_f64;
884   default:
885     return std::nullopt;
886   }
887 }
888 
889 static int getLdStRegType(EVT VT) {
890   if (VT.isFloatingPoint())
891     switch (VT.getSimpleVT().SimpleTy) {
892     case MVT::f16:
893     case MVT::bf16:
894     case MVT::v2f16:
895     case MVT::v2bf16:
896       return NVPTX::PTXLdStInstCode::Untyped;
897     default:
898       return NVPTX::PTXLdStInstCode::Float;
899     }
900   else
901     return NVPTX::PTXLdStInstCode::Unsigned;
902 }
903 
904 bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
905   SDLoc dl(N);
906   MemSDNode *LD = cast<MemSDNode>(N);
907   assert(LD->readMem() && "Expected load");
908   LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
909   EVT LoadedVT = LD->getMemoryVT();
910   SDNode *NVPTXLD = nullptr;
911 
912   // do not support pre/post inc/dec
913   if (PlainLoad && PlainLoad->isIndexed())
914     return false;
915 
916   if (!LoadedVT.isSimple())
917     return false;
918 
919   AtomicOrdering Ordering = LD->getSuccessOrdering();
920   // In order to lower atomic loads with stronger guarantees we would need to
921   // use load.acquire or insert fences. However these features were only added
922   // with PTX ISA 6.0 / sm_70.
923   // TODO: Check if we can actually use the new instructions and implement them.
924   if (isStrongerThanMonotonic(Ordering))
925     return false;
926 
927   // Address Space Setting
928   unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
929   if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
930     return tryLDGLDU(N);
931   }
932 
933   unsigned int PointerSize =
934       CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
935 
936   // Volatile Setting
937   // - .volatile is only available for .global and .shared
938   // - .volatile has the same memory synchronization semantics as .relaxed.sys
939   bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
940   if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
941       CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
942       CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
943     isVolatile = false;
944 
945   // Type Setting: fromType + fromTypeWidth
946   //
947   // Sign   : ISD::SEXTLOAD
948   // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
949   //          type is integer
950   // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
951   MVT SimpleVT = LoadedVT.getSimpleVT();
952   MVT ScalarVT = SimpleVT.getScalarType();
953   // Read at least 8 bits (predicates are stored as 8-bit values)
954   unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
955   unsigned int fromType;
956 
957   // Vector Setting
958   unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
959   if (SimpleVT.isVector()) {
960     assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
961            "Unexpected vector type");
962     // v2f16/v2bf16/v2i16 is loaded using ld.b32
963     fromTypeWidth = 32;
964   }
965 
966   if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
967     fromType = NVPTX::PTXLdStInstCode::Signed;
968   else
969     fromType = getLdStRegType(ScalarVT);
970 
971   // Create the machine instruction DAG
972   SDValue Chain = N->getOperand(0);
973   SDValue N1 = N->getOperand(1);
974   SDValue Addr;
975   SDValue Offset, Base;
976   std::optional<unsigned> Opcode;
977   MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
978 
979   if (SelectDirectAddr(N1, Addr)) {
980     Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
981                              NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
982                              NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
983     if (!Opcode)
984       return false;
985     SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
986                       getI32Imm(vecType, dl), getI32Imm(fromType, dl),
987                       getI32Imm(fromTypeWidth, dl), Addr, Chain };
988     NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
989   } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
990                                : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
991     Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
992                              NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
993                              NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
994     if (!Opcode)
995       return false;
996     SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
997                       getI32Imm(vecType, dl), getI32Imm(fromType, dl),
998                       getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
999     NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
1000   } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
1001                                : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
1002     if (PointerSize == 64)
1003       Opcode =
1004           pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
1005                           NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,
1006                           NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
1007     else
1008       Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,
1009                                NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,
1010                                NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
1011     if (!Opcode)
1012       return false;
1013     SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
1014                       getI32Imm(vecType, dl), getI32Imm(fromType, dl),
1015                       getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
1016     NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
1017   } else {
1018     if (PointerSize == 64)
1019       Opcode =
1020           pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
1021                           NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64,
1022                           NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64);
1023     else
1024       Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,
1025                                NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,
1026                                NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
1027     if (!Opcode)
1028       return false;
1029     SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
1030                       getI32Imm(vecType, dl), getI32Imm(fromType, dl),
1031                       getI32Imm(fromTypeWidth, dl), N1, Chain };
1032     NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
1033   }
1034 
1035   if (!NVPTXLD)
1036     return false;
1037 
1038   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1039   CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
1040 
1041   ReplaceNode(N, NVPTXLD);
1042   return true;
1043 }
1044 
1045 bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1046 
1047   SDValue Chain = N->getOperand(0);
1048   SDValue Op1 = N->getOperand(1);
1049   SDValue Addr, Offset, Base;
1050   std::optional<unsigned> Opcode;
1051   SDLoc DL(N);
1052   SDNode *LD;
1053   MemSDNode *MemSD = cast<MemSDNode>(N);
1054   EVT LoadedVT = MemSD->getMemoryVT();
1055 
1056   if (!LoadedVT.isSimple())
1057     return false;
1058 
1059   // Address Space Setting
1060   unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
1061   if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
1062     return tryLDGLDU(N);
1063   }
1064 
1065   unsigned int PointerSize =
1066       CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1067 
1068   // Volatile Setting
1069   // - .volatile is only availalble for .global and .shared
1070   bool IsVolatile = MemSD->isVolatile();
1071   if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1072       CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1073       CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1074     IsVolatile = false;
1075 
1076   // Vector Setting
1077   MVT SimpleVT = LoadedVT.getSimpleVT();
1078 
1079   // Type Setting: fromType + fromTypeWidth
1080   //
1081   // Sign   : ISD::SEXTLOAD
1082   // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
1083   //          type is integer
1084   // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
1085   MVT ScalarVT = SimpleVT.getScalarType();
1086   // Read at least 8 bits (predicates are stored as 8-bit values)
1087   unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
1088   unsigned int FromType;
1089   // The last operand holds the original LoadSDNode::getExtensionType() value
1090   unsigned ExtensionType = cast<ConstantSDNode>(
1091       N->getOperand(N->getNumOperands() - 1))->getZExtValue();
1092   if (ExtensionType == ISD::SEXTLOAD)
1093     FromType = NVPTX::PTXLdStInstCode::Signed;
1094   else
1095     FromType = getLdStRegType(ScalarVT);
1096 
1097   unsigned VecType;
1098 
1099   switch (N->getOpcode()) {
1100   case NVPTXISD::LoadV2:
1101     VecType = NVPTX::PTXLdStInstCode::V2;
1102     break;
1103   case NVPTXISD::LoadV4:
1104     VecType = NVPTX::PTXLdStInstCode::V4;
1105     break;
1106   default:
1107     return false;
1108   }
1109 
1110   EVT EltVT = N->getValueType(0);
1111 
1112   // v8x16 is a special case. PTX doesn't have ld.v8.16
1113   // instruction. Instead, we split the vector into v2x16 chunks and
1114   // load them with ld.v4.b32.
1115   if (Isv2x16VT(EltVT)) {
1116     assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
1117     EltVT = MVT::i32;
1118     FromType = NVPTX::PTXLdStInstCode::Untyped;
1119     FromTypeWidth = 32;
1120   }
1121 
1122   if (SelectDirectAddr(Op1, Addr)) {
1123     switch (N->getOpcode()) {
1124     default:
1125       return false;
1126     case NVPTXISD::LoadV2:
1127       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1128                                NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
1129                                NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
1130                                NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
1131       break;
1132     case NVPTXISD::LoadV4:
1133       Opcode =
1134           pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
1135                           NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,
1136                           std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);
1137       break;
1138     }
1139     if (!Opcode)
1140       return false;
1141     SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1142                       getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1143                       getI32Imm(FromTypeWidth, DL), Addr, Chain };
1144     LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
1145   } else if (PointerSize == 64
1146                  ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
1147                  : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
1148     switch (N->getOpcode()) {
1149     default:
1150       return false;
1151     case NVPTXISD::LoadV2:
1152       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1153                                NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
1154                                NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
1155                                NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
1156       break;
1157     case NVPTXISD::LoadV4:
1158       Opcode =
1159           pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
1160                           NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,
1161                           std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
1162       break;
1163     }
1164     if (!Opcode)
1165       return false;
1166     SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1167                       getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1168                       getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
1169     LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
1170   } else if (PointerSize == 64
1171                  ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1172                  : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1173     if (PointerSize == 64) {
1174       switch (N->getOpcode()) {
1175       default:
1176         return false;
1177       case NVPTXISD::LoadV2:
1178         Opcode =
1179             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1180                             NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
1181                             NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
1182                             NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);
1183         break;
1184       case NVPTXISD::LoadV4:
1185         Opcode = pickOpcodeForVT(
1186             EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
1187             NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
1188             NVPTX::LDV_f32_v4_ari_64, std::nullopt);
1189         break;
1190       }
1191     } else {
1192       switch (N->getOpcode()) {
1193       default:
1194         return false;
1195       case NVPTXISD::LoadV2:
1196         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1197                                  NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
1198                                  NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
1199                                  NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
1200         break;
1201       case NVPTXISD::LoadV4:
1202         Opcode =
1203             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
1204                             NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,
1205                             std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);
1206         break;
1207       }
1208     }
1209     if (!Opcode)
1210       return false;
1211     SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1212                       getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1213                       getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
1214 
1215     LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
1216   } else {
1217     if (PointerSize == 64) {
1218       switch (N->getOpcode()) {
1219       default:
1220         return false;
1221       case NVPTXISD::LoadV2:
1222         Opcode = pickOpcodeForVT(
1223             EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
1224             NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
1225             NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
1226             NVPTX::LDV_f64_v2_areg_64);
1227         break;
1228       case NVPTXISD::LoadV4:
1229         Opcode = pickOpcodeForVT(
1230             EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
1231             NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
1232             NVPTX::LDV_f32_v4_areg_64, std::nullopt);
1233         break;
1234       }
1235     } else {
1236       switch (N->getOpcode()) {
1237       default:
1238         return false;
1239       case NVPTXISD::LoadV2:
1240         Opcode =
1241             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
1242                             NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
1243                             NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,
1244                             NVPTX::LDV_f64_v2_areg);
1245         break;
1246       case NVPTXISD::LoadV4:
1247         Opcode =
1248             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
1249                             NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,
1250                             std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);
1251         break;
1252       }
1253     }
1254     if (!Opcode)
1255       return false;
1256     SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1257                       getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1258                       getI32Imm(FromTypeWidth, DL), Op1, Chain };
1259     LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
1260   }
1261 
1262   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1263   CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
1264 
1265   ReplaceNode(N, LD);
1266   return true;
1267 }
1268 
1269 bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
1270 
1271   SDValue Chain = N->getOperand(0);
1272   SDValue Op1;
1273   MemSDNode *Mem;
1274   bool IsLDG = true;
1275 
1276   // If this is an LDG intrinsic, the address is the third operand. If its an
1277   // LDG/LDU SD node (from custom vector handling), then its the second operand
1278   if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
1279     Op1 = N->getOperand(2);
1280     Mem = cast<MemIntrinsicSDNode>(N);
1281     unsigned IID = N->getConstantOperandVal(1);
1282     switch (IID) {
1283     default:
1284       return false;
1285     case Intrinsic::nvvm_ldg_global_f:
1286     case Intrinsic::nvvm_ldg_global_i:
1287     case Intrinsic::nvvm_ldg_global_p:
1288       IsLDG = true;
1289       break;
1290     case Intrinsic::nvvm_ldu_global_f:
1291     case Intrinsic::nvvm_ldu_global_i:
1292     case Intrinsic::nvvm_ldu_global_p:
1293       IsLDG = false;
1294       break;
1295     }
1296   } else {
1297     Op1 = N->getOperand(1);
1298     Mem = cast<MemSDNode>(N);
1299   }
1300 
1301   std::optional<unsigned> Opcode;
1302   SDLoc DL(N);
1303   SDNode *LD;
1304   SDValue Base, Offset, Addr;
1305   EVT OrigType = N->getValueType(0);
1306 
1307   EVT EltVT = Mem->getMemoryVT();
1308   unsigned NumElts = 1;
1309   if (EltVT.isVector()) {
1310     NumElts = EltVT.getVectorNumElements();
1311     EltVT = EltVT.getVectorElementType();
1312     // vectors of 16bits type are loaded/stored as multiples of v2x16 elements.
1313     if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
1314         (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
1315         (EltVT == MVT::i16 && OrigType == MVT::v2i16)) {
1316       assert(NumElts % 2 == 0 && "Vector must have even number of elements");
1317       EltVT = OrigType;
1318       NumElts /= 2;
1319     } else if (OrigType == MVT::v4i8) {
1320       EltVT = OrigType;
1321       NumElts = 1;
1322     }
1323   }
1324 
1325   // Build the "promoted" result VTList for the load. If we are really loading
1326   // i8s, then the return type will be promoted to i16 since we do not expose
1327   // 8-bit registers in NVPTX.
1328   EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
1329   SmallVector<EVT, 5> InstVTs;
1330   for (unsigned i = 0; i != NumElts; ++i) {
1331     InstVTs.push_back(NodeVT);
1332   }
1333   InstVTs.push_back(MVT::Other);
1334   SDVTList InstVTList = CurDAG->getVTList(InstVTs);
1335 
1336   if (SelectDirectAddr(Op1, Addr)) {
1337     switch (N->getOpcode()) {
1338     default:
1339       return false;
1340     case ISD::LOAD:
1341     case ISD::INTRINSIC_W_CHAIN:
1342       if (IsLDG)
1343         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1344                                  NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
1345                                  NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
1346                                  NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
1347                                  NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
1348                                  NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
1349                                  NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
1350       else
1351         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1352                                  NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
1353                                  NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
1354                                  NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
1355                                  NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
1356                                  NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
1357                                  NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
1358       break;
1359     case NVPTXISD::LoadV2:
1360     case NVPTXISD::LDGV2:
1361       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1362                                NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
1363                                NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
1364                                NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
1365                                NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
1366                                NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
1367                                NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
1368       break;
1369     case NVPTXISD::LDUV2:
1370       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1371                                NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
1372                                NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
1373                                NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
1374                                NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
1375                                NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
1376                                NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
1377       break;
1378     case NVPTXISD::LoadV4:
1379     case NVPTXISD::LDGV4:
1380       Opcode = pickOpcodeForVT(
1381           EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
1382           NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
1383           NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
1384           NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
1385       break;
1386     case NVPTXISD::LDUV4:
1387       Opcode = pickOpcodeForVT(
1388           EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
1389           NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
1390           NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
1391           NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
1392       break;
1393     }
1394     if (!Opcode)
1395       return false;
1396     SDValue Ops[] = { Addr, Chain };
1397     LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1398   } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1399                           : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1400     if (TM.is64Bit()) {
1401       switch (N->getOpcode()) {
1402       default:
1403         return false;
1404       case ISD::LOAD:
1405       case ISD::INTRINSIC_W_CHAIN:
1406         if (IsLDG)
1407           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1408                                        NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
1409                                        NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
1410                                        NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
1411                                        NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
1412                                        NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
1413                                        NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
1414         else
1415           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1416                                        NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
1417                                        NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
1418                                        NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
1419                                        NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
1420                                        NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
1421                                        NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
1422         break;
1423       case NVPTXISD::LoadV2:
1424       case NVPTXISD::LDGV2:
1425         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1426                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
1427                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
1428                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
1429                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
1430                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
1431                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
1432         break;
1433       case NVPTXISD::LDUV2:
1434         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1435                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
1436                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
1437                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
1438                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
1439                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
1440                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
1441         break;
1442       case NVPTXISD::LoadV4:
1443       case NVPTXISD::LDGV4:
1444         Opcode = pickOpcodeForVT(
1445             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
1446             NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
1447             NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
1448             NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
1449         break;
1450       case NVPTXISD::LDUV4:
1451         Opcode = pickOpcodeForVT(
1452             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
1453             NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
1454             NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
1455             NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
1456         break;
1457       }
1458     } else {
1459       switch (N->getOpcode()) {
1460       default:
1461         return false;
1462       case ISD::LOAD:
1463       case ISD::INTRINSIC_W_CHAIN:
1464         if (IsLDG)
1465           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1466                                    NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
1467                                    NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
1468                                    NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
1469                                    NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
1470                                    NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
1471                                    NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
1472         else
1473           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1474                                    NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
1475                                    NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
1476                                    NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
1477                                    NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
1478                                    NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
1479                                    NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
1480         break;
1481       case NVPTXISD::LoadV2:
1482       case NVPTXISD::LDGV2:
1483         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1484                                  NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
1485                                  NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
1486                                  NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
1487                                  NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
1488                                  NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
1489                                  NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
1490         break;
1491       case NVPTXISD::LDUV2:
1492         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1493                                  NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
1494                                  NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
1495                                  NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
1496                                  NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
1497                                  NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
1498                                  NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
1499         break;
1500       case NVPTXISD::LoadV4:
1501       case NVPTXISD::LDGV4:
1502         Opcode = pickOpcodeForVT(
1503             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
1504             NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
1505             NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
1506             NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
1507         break;
1508       case NVPTXISD::LDUV4:
1509         Opcode = pickOpcodeForVT(
1510             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
1511             NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
1512             NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
1513             NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
1514         break;
1515       }
1516     }
1517     if (!Opcode)
1518       return false;
1519     SDValue Ops[] = {Base, Offset, Chain};
1520     LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1521   } else {
1522     if (TM.is64Bit()) {
1523       switch (N->getOpcode()) {
1524       default:
1525         return false;
1526       case ISD::LOAD:
1527       case ISD::INTRINSIC_W_CHAIN:
1528         if (IsLDG)
1529           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1530                                        NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
1531                                        NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
1532                                        NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
1533                                        NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
1534                                        NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
1535                                        NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
1536         else
1537           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1538                                        NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
1539                                        NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
1540                                        NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
1541                                        NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
1542                                        NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
1543                                        NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
1544         break;
1545       case NVPTXISD::LoadV2:
1546       case NVPTXISD::LDGV2:
1547         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1548                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
1549                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
1550                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
1551                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
1552                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
1553                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
1554         break;
1555       case NVPTXISD::LDUV2:
1556         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1557                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
1558                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
1559                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
1560                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
1561                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
1562                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
1563         break;
1564       case NVPTXISD::LoadV4:
1565       case NVPTXISD::LDGV4:
1566         Opcode = pickOpcodeForVT(
1567             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
1568             NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
1569             NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
1570             NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
1571         break;
1572       case NVPTXISD::LDUV4:
1573         Opcode = pickOpcodeForVT(
1574             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
1575             NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
1576             NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
1577             NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
1578         break;
1579       }
1580     } else {
1581       switch (N->getOpcode()) {
1582       default:
1583         return false;
1584       case ISD::LOAD:
1585       case ISD::INTRINSIC_W_CHAIN:
1586         if (IsLDG)
1587           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1588                                    NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
1589                                    NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
1590                                    NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
1591                                    NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
1592                                    NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
1593                                    NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
1594         else
1595           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1596                                    NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
1597                                    NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
1598                                    NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
1599                                    NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
1600                                    NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
1601                                    NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
1602         break;
1603       case NVPTXISD::LoadV2:
1604       case NVPTXISD::LDGV2:
1605         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1606                                  NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
1607                                  NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
1608                                  NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
1609                                  NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
1610                                  NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
1611                                  NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
1612         break;
1613       case NVPTXISD::LDUV2:
1614         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1615                                  NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
1616                                  NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
1617                                  NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
1618                                  NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
1619                                  NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
1620                                  NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
1621         break;
1622       case NVPTXISD::LoadV4:
1623       case NVPTXISD::LDGV4:
1624         Opcode = pickOpcodeForVT(
1625             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
1626             NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
1627             NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
1628             NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
1629         break;
1630       case NVPTXISD::LDUV4:
1631         Opcode = pickOpcodeForVT(
1632             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
1633             NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
1634             NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
1635             NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
1636         break;
1637       }
1638     }
1639     if (!Opcode)
1640       return false;
1641     SDValue Ops[] = { Op1, Chain };
1642     LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1643   }
1644 
1645   // For automatic generation of LDG (through SelectLoad[Vector], not the
1646   // intrinsics), we may have an extending load like:
1647   //
1648   //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
1649   //
1650   // In this case, the matching logic above will select a load for the original
1651   // memory type (in this case, i8) and our types will not match (the node needs
1652   // to return an i32 in this case). Our LDG/LDU nodes do not support the
1653   // concept of sign-/zero-extension, so emulate it here by adding an explicit
1654   // CVT instruction. Ptxas should clean up any redundancies here.
1655 
1656   LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
1657 
1658   if (OrigType != EltVT &&
1659       (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
1660     // We have an extending-load. The instruction we selected operates on the
1661     // smaller type, but the SDNode we are replacing has the larger type. We
1662     // need to emit a CVT to make the types match.
1663     unsigned CvtOpc =
1664         GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);
1665 
1666     // For each output value, apply the manual sign/zero-extension and make sure
1667     // all users of the load go through that CVT.
1668     for (unsigned i = 0; i != NumElts; ++i) {
1669       SDValue Res(LD, i);
1670       SDValue OrigVal(N, i);
1671 
1672       SDNode *CvtNode =
1673         CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
1674                                CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
1675                                                          DL, MVT::i32));
1676       ReplaceUses(OrigVal, SDValue(CvtNode, 0));
1677     }
1678   }
1679 
1680   ReplaceNode(N, LD);
1681   return true;
1682 }
1683 
1684 bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1685   SDLoc dl(N);
1686   MemSDNode *ST = cast<MemSDNode>(N);
1687   assert(ST->writeMem() && "Expected store");
1688   StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
1689   AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
1690   assert((PlainStore || AtomicStore) && "Expected store");
1691   EVT StoreVT = ST->getMemoryVT();
1692   SDNode *NVPTXST = nullptr;
1693 
1694   // do not support pre/post inc/dec
1695   if (PlainStore && PlainStore->isIndexed())
1696     return false;
1697 
1698   if (!StoreVT.isSimple())
1699     return false;
1700 
1701   AtomicOrdering Ordering = ST->getSuccessOrdering();
1702   // In order to lower atomic loads with stronger guarantees we would need to
1703   // use store.release or insert fences. However these features were only added
1704   // with PTX ISA 6.0 / sm_70.
1705   // TODO: Check if we can actually use the new instructions and implement them.
1706   if (isStrongerThanMonotonic(Ordering))
1707     return false;
1708 
1709   // Address Space Setting
1710   unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
1711   unsigned int PointerSize =
1712       CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
1713 
1714   // Volatile Setting
1715   // - .volatile is only available for .global and .shared
1716   // - .volatile has the same memory synchronization semantics as .relaxed.sys
1717   bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
1718   if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1719       CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1720       CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1721     isVolatile = false;
1722 
1723   // Vector Setting
1724   MVT SimpleVT = StoreVT.getSimpleVT();
1725   unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
1726 
1727   // Type Setting: toType + toTypeWidth
1728   // - for integer type, always use 'u'
1729   //
1730   MVT ScalarVT = SimpleVT.getScalarType();
1731   unsigned toTypeWidth = ScalarVT.getSizeInBits();
1732   if (SimpleVT.isVector()) {
1733     assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
1734            "Unexpected vector type");
1735     // v2x16 is stored using st.b32
1736     toTypeWidth = 32;
1737   }
1738 
1739   unsigned int toType = getLdStRegType(ScalarVT);
1740 
1741   // Create the machine instruction DAG
1742   SDValue Chain = ST->getChain();
1743   SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
1744   SDValue BasePtr = ST->getBasePtr();
1745   SDValue Addr;
1746   SDValue Offset, Base;
1747   std::optional<unsigned> Opcode;
1748   MVT::SimpleValueType SourceVT =
1749       Value.getNode()->getSimpleValueType(0).SimpleTy;
1750 
1751   if (SelectDirectAddr(BasePtr, Addr)) {
1752     Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
1753                              NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
1754                              NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
1755     if (!Opcode)
1756       return false;
1757     SDValue Ops[] = {Value,
1758                      getI32Imm(isVolatile, dl),
1759                      getI32Imm(CodeAddrSpace, dl),
1760                      getI32Imm(vecType, dl),
1761                      getI32Imm(toType, dl),
1762                      getI32Imm(toTypeWidth, dl),
1763                      Addr,
1764                      Chain};
1765     NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
1766   } else if (PointerSize == 64
1767                  ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
1768                  : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
1769     Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
1770                              NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
1771                              NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
1772     if (!Opcode)
1773       return false;
1774     SDValue Ops[] = {Value,
1775                      getI32Imm(isVolatile, dl),
1776                      getI32Imm(CodeAddrSpace, dl),
1777                      getI32Imm(vecType, dl),
1778                      getI32Imm(toType, dl),
1779                      getI32Imm(toTypeWidth, dl),
1780                      Base,
1781                      Offset,
1782                      Chain};
1783     NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
1784   } else if (PointerSize == 64
1785                  ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
1786                  : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
1787     if (PointerSize == 64)
1788       Opcode =
1789           pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
1790                           NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64,
1791                           NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
1792     else
1793       Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
1794                                NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
1795                                NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
1796     if (!Opcode)
1797       return false;
1798 
1799     SDValue Ops[] = {Value,
1800                      getI32Imm(isVolatile, dl),
1801                      getI32Imm(CodeAddrSpace, dl),
1802                      getI32Imm(vecType, dl),
1803                      getI32Imm(toType, dl),
1804                      getI32Imm(toTypeWidth, dl),
1805                      Base,
1806                      Offset,
1807                      Chain};
1808     NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
1809   } else {
1810     if (PointerSize == 64)
1811       Opcode =
1812           pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
1813                           NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
1814                           NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
1815     else
1816       Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
1817                                NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
1818                                NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
1819     if (!Opcode)
1820       return false;
1821     SDValue Ops[] = {Value,
1822                      getI32Imm(isVolatile, dl),
1823                      getI32Imm(CodeAddrSpace, dl),
1824                      getI32Imm(vecType, dl),
1825                      getI32Imm(toType, dl),
1826                      getI32Imm(toTypeWidth, dl),
1827                      BasePtr,
1828                      Chain};
1829     NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
1830   }
1831 
1832   if (!NVPTXST)
1833     return false;
1834 
1835   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1836   CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1837   ReplaceNode(N, NVPTXST);
1838   return true;
1839 }
1840 
1841 bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1842   SDValue Chain = N->getOperand(0);
1843   SDValue Op1 = N->getOperand(1);
1844   SDValue Addr, Offset, Base;
1845   std::optional<unsigned> Opcode;
1846   SDLoc DL(N);
1847   SDNode *ST;
1848   EVT EltVT = Op1.getValueType();
1849   MemSDNode *MemSD = cast<MemSDNode>(N);
1850   EVT StoreVT = MemSD->getMemoryVT();
1851 
1852   // Address Space Setting
1853   unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
1854   if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
1855     report_fatal_error("Cannot store to pointer that points to constant "
1856                        "memory space");
1857   }
1858   unsigned int PointerSize =
1859       CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1860 
1861   // Volatile Setting
1862   // - .volatile is only availalble for .global and .shared
1863   bool IsVolatile = MemSD->isVolatile();
1864   if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1865       CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1866       CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1867     IsVolatile = false;
1868 
1869   // Type Setting: toType + toTypeWidth
1870   // - for integer type, always use 'u'
1871   assert(StoreVT.isSimple() && "Store value is not simple");
1872   MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
1873   unsigned ToTypeWidth = ScalarVT.getSizeInBits();
1874   unsigned ToType = getLdStRegType(ScalarVT);
1875 
1876   SmallVector<SDValue, 12> StOps;
1877   SDValue N2;
1878   unsigned VecType;
1879 
1880   switch (N->getOpcode()) {
1881   case NVPTXISD::StoreV2:
1882     VecType = NVPTX::PTXLdStInstCode::V2;
1883     StOps.push_back(N->getOperand(1));
1884     StOps.push_back(N->getOperand(2));
1885     N2 = N->getOperand(3);
1886     break;
1887   case NVPTXISD::StoreV4:
1888     VecType = NVPTX::PTXLdStInstCode::V4;
1889     StOps.push_back(N->getOperand(1));
1890     StOps.push_back(N->getOperand(2));
1891     StOps.push_back(N->getOperand(3));
1892     StOps.push_back(N->getOperand(4));
1893     N2 = N->getOperand(5);
1894     break;
1895   default:
1896     return false;
1897   }
1898 
1899   // v8x16 is a special case. PTX doesn't have st.v8.x16
1900   // instruction. Instead, we split the vector into v2x16 chunks and
1901   // store them with st.v4.b32.
1902   if (Isv2x16VT(EltVT)) {
1903     assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
1904     EltVT = MVT::i32;
1905     ToType = NVPTX::PTXLdStInstCode::Untyped;
1906     ToTypeWidth = 32;
1907   }
1908 
1909   StOps.push_back(getI32Imm(IsVolatile, DL));
1910   StOps.push_back(getI32Imm(CodeAddrSpace, DL));
1911   StOps.push_back(getI32Imm(VecType, DL));
1912   StOps.push_back(getI32Imm(ToType, DL));
1913   StOps.push_back(getI32Imm(ToTypeWidth, DL));
1914 
1915   if (SelectDirectAddr(N2, Addr)) {
1916     switch (N->getOpcode()) {
1917     default:
1918       return false;
1919     case NVPTXISD::StoreV2:
1920       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1921                                NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
1922                                NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
1923                                NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
1924       break;
1925     case NVPTXISD::StoreV4:
1926       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1927                                NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
1928                                NVPTX::STV_i32_v4_avar, std::nullopt,
1929                                NVPTX::STV_f32_v4_avar, std::nullopt);
1930       break;
1931     }
1932     StOps.push_back(Addr);
1933   } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
1934                                : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
1935     switch (N->getOpcode()) {
1936     default:
1937       return false;
1938     case NVPTXISD::StoreV2:
1939       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1940                                NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
1941                                NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
1942                                NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
1943       break;
1944     case NVPTXISD::StoreV4:
1945       Opcode =
1946           pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
1947                           NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi,
1948                           std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
1949       break;
1950     }
1951     StOps.push_back(Base);
1952     StOps.push_back(Offset);
1953   } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
1954                                : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
1955     if (PointerSize == 64) {
1956       switch (N->getOpcode()) {
1957       default:
1958         return false;
1959       case NVPTXISD::StoreV2:
1960         Opcode =
1961             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1962                             NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64,
1963                             NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64,
1964                             NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64);
1965         break;
1966       case NVPTXISD::StoreV4:
1967         Opcode = pickOpcodeForVT(
1968             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
1969             NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
1970             NVPTX::STV_f32_v4_ari_64, std::nullopt);
1971         break;
1972       }
1973     } else {
1974       switch (N->getOpcode()) {
1975       default:
1976         return false;
1977       case NVPTXISD::StoreV2:
1978         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1979                                  NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
1980                                  NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
1981                                  NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
1982         break;
1983       case NVPTXISD::StoreV4:
1984         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1985                                  NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
1986                                  NVPTX::STV_i32_v4_ari, std::nullopt,
1987                                  NVPTX::STV_f32_v4_ari, std::nullopt);
1988         break;
1989       }
1990     }
1991     StOps.push_back(Base);
1992     StOps.push_back(Offset);
1993   } else {
1994     if (PointerSize == 64) {
1995       switch (N->getOpcode()) {
1996       default:
1997         return false;
1998       case NVPTXISD::StoreV2:
1999         Opcode = pickOpcodeForVT(
2000             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
2001             NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
2002             NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
2003             NVPTX::STV_f64_v2_areg_64);
2004         break;
2005       case NVPTXISD::StoreV4:
2006         Opcode = pickOpcodeForVT(
2007             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
2008             NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
2009             NVPTX::STV_f32_v4_areg_64, std::nullopt);
2010         break;
2011       }
2012     } else {
2013       switch (N->getOpcode()) {
2014       default:
2015         return false;
2016       case NVPTXISD::StoreV2:
2017         Opcode =
2018             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
2019                             NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
2020                             NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg,
2021                             NVPTX::STV_f64_v2_areg);
2022         break;
2023       case NVPTXISD::StoreV4:
2024         Opcode =
2025             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
2026                             NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg,
2027                             std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt);
2028         break;
2029       }
2030     }
2031     StOps.push_back(N2);
2032   }
2033 
2034   if (!Opcode)
2035     return false;
2036 
2037   StOps.push_back(Chain);
2038 
2039   ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps);
2040 
2041   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2042   CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
2043 
2044   ReplaceNode(N, ST);
2045   return true;
2046 }
2047 
2048 bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
2049   SDValue Chain = Node->getOperand(0);
2050   SDValue Offset = Node->getOperand(2);
2051   SDValue Glue = Node->getOperand(3);
2052   SDLoc DL(Node);
2053   MemSDNode *Mem = cast<MemSDNode>(Node);
2054 
2055   unsigned VecSize;
2056   switch (Node->getOpcode()) {
2057   default:
2058     return false;
2059   case NVPTXISD::LoadParam:
2060     VecSize = 1;
2061     break;
2062   case NVPTXISD::LoadParamV2:
2063     VecSize = 2;
2064     break;
2065   case NVPTXISD::LoadParamV4:
2066     VecSize = 4;
2067     break;
2068   }
2069 
2070   EVT EltVT = Node->getValueType(0);
2071   EVT MemVT = Mem->getMemoryVT();
2072 
2073   std::optional<unsigned> Opcode;
2074 
2075   switch (VecSize) {
2076   default:
2077     return false;
2078   case 1:
2079     Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
2080                              NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
2081                              NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
2082                              NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
2083     break;
2084   case 2:
2085     Opcode =
2086         pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
2087                         NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
2088                         NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,
2089                         NVPTX::LoadParamMemV2F64);
2090     break;
2091   case 4:
2092     Opcode =
2093         pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
2094                         NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,
2095                         std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);
2096     break;
2097   }
2098   if (!Opcode)
2099     return false;
2100 
2101   SDVTList VTs;
2102   if (VecSize == 1) {
2103     VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
2104   } else if (VecSize == 2) {
2105     VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
2106   } else {
2107     EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
2108     VTs = CurDAG->getVTList(EVTs);
2109   }
2110 
2111   unsigned OffsetVal = Offset->getAsZExtVal();
2112 
2113   SmallVector<SDValue, 2> Ops;
2114   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2115   Ops.push_back(Chain);
2116   Ops.push_back(Glue);
2117 
2118   ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
2119   return true;
2120 }
2121 
2122 bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
2123   SDLoc DL(N);
2124   SDValue Chain = N->getOperand(0);
2125   SDValue Offset = N->getOperand(1);
2126   unsigned OffsetVal = Offset->getAsZExtVal();
2127   MemSDNode *Mem = cast<MemSDNode>(N);
2128 
2129   // How many elements do we have?
2130   unsigned NumElts = 1;
2131   switch (N->getOpcode()) {
2132   default:
2133     return false;
2134   case NVPTXISD::StoreRetval:
2135     NumElts = 1;
2136     break;
2137   case NVPTXISD::StoreRetvalV2:
2138     NumElts = 2;
2139     break;
2140   case NVPTXISD::StoreRetvalV4:
2141     NumElts = 4;
2142     break;
2143   }
2144 
2145   // Build vector of operands
2146   SmallVector<SDValue, 6> Ops;
2147   for (unsigned i = 0; i < NumElts; ++i)
2148     Ops.push_back(N->getOperand(i + 2));
2149   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2150   Ops.push_back(Chain);
2151 
2152   // Determine target opcode
2153   // If we have an i1, use an 8-bit store. The lowering code in
2154   // NVPTXISelLowering will have already emitted an upcast.
2155   std::optional<unsigned> Opcode = 0;
2156   switch (NumElts) {
2157   default:
2158     return false;
2159   case 1:
2160     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2161                              NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
2162                              NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
2163                              NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
2164     if (Opcode == NVPTX::StoreRetvalI8) {
2165       // Fine tune the opcode depending on the size of the operand.
2166       // This helps to avoid creating redundant COPY instructions in
2167       // InstrEmitter::AddRegisterOperand().
2168       switch (Ops[0].getSimpleValueType().SimpleTy) {
2169       default:
2170         break;
2171       case MVT::i32:
2172         Opcode = NVPTX::StoreRetvalI8TruncI32;
2173         break;
2174       case MVT::i64:
2175         Opcode = NVPTX::StoreRetvalI8TruncI64;
2176         break;
2177       }
2178     }
2179     break;
2180   case 2:
2181     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2182                              NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2183                              NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2184                              NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2185     break;
2186   case 4:
2187     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2188                              NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2189                              NVPTX::StoreRetvalV4I32, std::nullopt,
2190                              NVPTX::StoreRetvalV4F32, std::nullopt);
2191     break;
2192   }
2193   if (!Opcode)
2194     return false;
2195 
2196   SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
2197   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2198   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2199 
2200   ReplaceNode(N, Ret);
2201   return true;
2202 }
2203 
// Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri)
//
// The opcode name is NVPTX::StoreParamV<N><ty>_ followed by one letter per
// element: 'i' when that element is an immediate, 'r' when it is a register.
// `ty` is the type suffix token (I8, I16, I32, I64, F32, F64) and `isimm` is
// anything indexable with [0]..[N-1] that yields a bool per element.
//
// getOpcV2H / getOpcV4H paste the final enumerator name. The numbered
// helpers each resolve one more element to 'i' or 'r'. Every conditional
// expansion is fully parenthesized so the macros can be used inside a larger
// expression without being re-associated by operator precedence.
#define getOpcV2H(ty, opKind0, opKind1)                                        \
  NVPTX::StoreParamV2##ty##_##opKind0##opKind1

#define getOpcV2H1(ty, opKind0, isImm1)                                        \
  ((isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r))

#define getOpcodeForVectorStParamV2(ty, isimm)                                 \
  ((isimm)[0] ? getOpcV2H1(ty, i, (isimm)[1])                                  \
              : getOpcV2H1(ty, r, (isimm)[1]))

#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3)                      \
  NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3

#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3)                      \
  ((isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i)                      \
            : getOpcV4H(ty, opKind0, opKind1, opKind2, r))

#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3)                       \
  ((isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3)                      \
            : getOpcV4H3(ty, opKind0, opKind1, r, isImm3))

#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3)                        \
  ((isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3)                       \
            : getOpcV4H2(ty, opKind0, r, isImm2, isImm3))

#define getOpcodeForVectorStParamV4(ty, isimm)                                 \
  ((isimm)[0] ? getOpcV4H1(ty, i, (isimm)[1], (isimm)[2], (isimm)[3])          \
              : getOpcV4H1(ty, r, (isimm)[1], (isimm)[2], (isimm)[3]))

// Dispatch on the element count: 2 selects the V2 family, anything else the
// V4 family.
#define getOpcodeForVectorStParam(n, ty, isimm)                                \
  (((n) == 2) ? getOpcodeForVectorStParamV2(ty, isimm)                         \
              : getOpcodeForVectorStParamV4(ty, isimm))
2236 
2237 static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops,
2238                                            unsigned NumElts,
2239                                            MVT::SimpleValueType MemTy,
2240                                            SelectionDAG *CurDAG, SDLoc DL) {
2241   // Determine which inputs are registers and immediates make new operators
2242   // with constant values
2243   SmallVector<bool, 4> IsImm(NumElts, false);
2244   for (unsigned i = 0; i < NumElts; i++) {
2245     IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i]));
2246     if (IsImm[i]) {
2247       SDValue Imm = Ops[i];
2248       if (MemTy == MVT::f32 || MemTy == MVT::f64) {
2249         const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
2250         const ConstantFP *CF = ConstImm->getConstantFPValue();
2251         Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
2252       } else {
2253         const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
2254         const ConstantInt *CI = ConstImm->getConstantIntValue();
2255         Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
2256       }
2257       Ops[i] = Imm;
2258     }
2259   }
2260 
2261   // Get opcode for MemTy, size, and register/immediate operand ordering
2262   switch (MemTy) {
2263   case MVT::i8:
2264     return getOpcodeForVectorStParam(NumElts, I8, IsImm);
2265   case MVT::i16:
2266     return getOpcodeForVectorStParam(NumElts, I16, IsImm);
2267   case MVT::i32:
2268     return getOpcodeForVectorStParam(NumElts, I32, IsImm);
2269   case MVT::i64:
2270     assert(NumElts == 2 && "MVT too large for NumElts > 2");
2271     return getOpcodeForVectorStParamV2(I64, IsImm);
2272   case MVT::f32:
2273     return getOpcodeForVectorStParam(NumElts, F32, IsImm);
2274   case MVT::f64:
2275     assert(NumElts == 2 && "MVT too large for NumElts > 2");
2276     return getOpcodeForVectorStParamV2(F64, IsImm);
2277 
2278   // These cases don't support immediates, just use the all register version
2279   // and generate moves.
2280   case MVT::i1:
2281     return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr
2282                           : NVPTX::StoreParamV4I8_rrrr;
2283   case MVT::f16:
2284   case MVT::bf16:
2285     return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr
2286                           : NVPTX::StoreParamV4I16_rrrr;
2287   case MVT::v2f16:
2288   case MVT::v2bf16:
2289   case MVT::v2i16:
2290   case MVT::v4i8:
2291     return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr
2292                           : NVPTX::StoreParamV4I32_rrrr;
2293   default:
2294     llvm_unreachable("Cannot select st.param for unknown MemTy");
2295   }
2296 }
2297 
2298 bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2299   SDLoc DL(N);
2300   SDValue Chain = N->getOperand(0);
2301   SDValue Param = N->getOperand(1);
2302   unsigned ParamVal = Param->getAsZExtVal();
2303   SDValue Offset = N->getOperand(2);
2304   unsigned OffsetVal = Offset->getAsZExtVal();
2305   MemSDNode *Mem = cast<MemSDNode>(N);
2306   SDValue Glue = N->getOperand(N->getNumOperands() - 1);
2307 
2308   // How many elements do we have?
2309   unsigned NumElts;
2310   switch (N->getOpcode()) {
2311   default:
2312     llvm_unreachable("Unexpected opcode");
2313   case NVPTXISD::StoreParamU32:
2314   case NVPTXISD::StoreParamS32:
2315   case NVPTXISD::StoreParam:
2316     NumElts = 1;
2317     break;
2318   case NVPTXISD::StoreParamV2:
2319     NumElts = 2;
2320     break;
2321   case NVPTXISD::StoreParamV4:
2322     NumElts = 4;
2323     break;
2324   }
2325 
2326   // Build vector of operands
2327   SmallVector<SDValue, 8> Ops;
2328   for (unsigned i = 0; i < NumElts; ++i)
2329     Ops.push_back(N->getOperand(i + 3));
2330   Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
2331   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2332   Ops.push_back(Chain);
2333   Ops.push_back(Glue);
2334 
2335   // Determine target opcode
2336   // If we have an i1, use an 8-bit store. The lowering code in
2337   // NVPTXISelLowering will have already emitted an upcast.
2338   std::optional<unsigned> Opcode;
2339   switch (N->getOpcode()) {
2340   default:
2341     switch (NumElts) {
2342     default:
2343       llvm_unreachable("Unexpected NumElts");
2344     case 1: {
2345       MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
2346       SDValue Imm = Ops[0];
2347       if (MemTy != MVT::f16 && MemTy != MVT::v2f16 &&
2348           (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) {
2349         // Convert immediate to target constant
2350         if (MemTy == MVT::f32 || MemTy == MVT::f64) {
2351           const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
2352           const ConstantFP *CF = ConstImm->getConstantFPValue();
2353           Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
2354         } else {
2355           const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
2356           const ConstantInt *CI = ConstImm->getConstantIntValue();
2357           Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
2358         }
2359         Ops[0] = Imm;
2360         // Use immediate version of store param
2361         Opcode = pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i,
2362                                  NVPTX::StoreParamI16_i, NVPTX::StoreParamI32_i,
2363                                  NVPTX::StoreParamI64_i, NVPTX::StoreParamF32_i,
2364                                  NVPTX::StoreParamF64_i);
2365       } else
2366         Opcode =
2367             pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2368                             NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r,
2369                             NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r,
2370                             NVPTX::StoreParamF32_r, NVPTX::StoreParamF64_r);
2371       if (Opcode == NVPTX::StoreParamI8_r) {
2372         // Fine tune the opcode depending on the size of the operand.
2373         // This helps to avoid creating redundant COPY instructions in
2374         // InstrEmitter::AddRegisterOperand().
2375         switch (Ops[0].getSimpleValueType().SimpleTy) {
2376         default:
2377           break;
2378         case MVT::i32:
2379           Opcode = NVPTX::StoreParamI8TruncI32_r;
2380           break;
2381         case MVT::i64:
2382           Opcode = NVPTX::StoreParamI8TruncI64_r;
2383           break;
2384         }
2385       }
2386       break;
2387     }
2388     case 2:
2389     case 4: {
2390       MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
2391       Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL);
2392       break;
2393     }
2394     }
2395     break;
2396   // Special case: if we have a sign-extend/zero-extend node, insert the
2397   // conversion instruction first, and use that as the value operand to
2398   // the selected StoreParam node.
2399   case NVPTXISD::StoreParamU32: {
2400     Opcode = NVPTX::StoreParamI32_r;
2401     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2402                                                 MVT::i32);
2403     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2404                                          MVT::i32, Ops[0], CvtNone);
2405     Ops[0] = SDValue(Cvt, 0);
2406     break;
2407   }
2408   case NVPTXISD::StoreParamS32: {
2409     Opcode = NVPTX::StoreParamI32_r;
2410     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2411                                                 MVT::i32);
2412     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2413                                          MVT::i32, Ops[0], CvtNone);
2414     Ops[0] = SDValue(Cvt, 0);
2415     break;
2416   }
2417   }
2418 
2419   SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2420   SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
2421   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2422   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2423 
2424   ReplaceNode(N, Ret);
2425   return true;
2426 }
2427 
2428 bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
2429   unsigned Opc = 0;
2430 
2431   switch (N->getOpcode()) {
2432   default: return false;
2433   case NVPTXISD::Tex1DFloatS32:
2434     Opc = NVPTX::TEX_1D_F32_S32_RR;
2435     break;
2436   case NVPTXISD::Tex1DFloatFloat:
2437     Opc = NVPTX::TEX_1D_F32_F32_RR;
2438     break;
2439   case NVPTXISD::Tex1DFloatFloatLevel:
2440     Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR;
2441     break;
2442   case NVPTXISD::Tex1DFloatFloatGrad:
2443     Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR;
2444     break;
2445   case NVPTXISD::Tex1DS32S32:
2446     Opc = NVPTX::TEX_1D_S32_S32_RR;
2447     break;
2448   case NVPTXISD::Tex1DS32Float:
2449     Opc = NVPTX::TEX_1D_S32_F32_RR;
2450     break;
2451   case NVPTXISD::Tex1DS32FloatLevel:
2452     Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR;
2453     break;
2454   case NVPTXISD::Tex1DS32FloatGrad:
2455     Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR;
2456     break;
2457   case NVPTXISD::Tex1DU32S32:
2458     Opc = NVPTX::TEX_1D_U32_S32_RR;
2459     break;
2460   case NVPTXISD::Tex1DU32Float:
2461     Opc = NVPTX::TEX_1D_U32_F32_RR;
2462     break;
2463   case NVPTXISD::Tex1DU32FloatLevel:
2464     Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR;
2465     break;
2466   case NVPTXISD::Tex1DU32FloatGrad:
2467     Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR;
2468     break;
2469   case NVPTXISD::Tex1DArrayFloatS32:
2470     Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR;
2471     break;
2472   case NVPTXISD::Tex1DArrayFloatFloat:
2473     Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR;
2474     break;
2475   case NVPTXISD::Tex1DArrayFloatFloatLevel:
2476     Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR;
2477     break;
2478   case NVPTXISD::Tex1DArrayFloatFloatGrad:
2479     Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR;
2480     break;
2481   case NVPTXISD::Tex1DArrayS32S32:
2482     Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR;
2483     break;
2484   case NVPTXISD::Tex1DArrayS32Float:
2485     Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR;
2486     break;
2487   case NVPTXISD::Tex1DArrayS32FloatLevel:
2488     Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR;
2489     break;
2490   case NVPTXISD::Tex1DArrayS32FloatGrad:
2491     Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR;
2492     break;
2493   case NVPTXISD::Tex1DArrayU32S32:
2494     Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR;
2495     break;
2496   case NVPTXISD::Tex1DArrayU32Float:
2497     Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR;
2498     break;
2499   case NVPTXISD::Tex1DArrayU32FloatLevel:
2500     Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR;
2501     break;
2502   case NVPTXISD::Tex1DArrayU32FloatGrad:
2503     Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR;
2504     break;
2505   case NVPTXISD::Tex2DFloatS32:
2506     Opc = NVPTX::TEX_2D_F32_S32_RR;
2507     break;
2508   case NVPTXISD::Tex2DFloatFloat:
2509     Opc = NVPTX::TEX_2D_F32_F32_RR;
2510     break;
2511   case NVPTXISD::Tex2DFloatFloatLevel:
2512     Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR;
2513     break;
2514   case NVPTXISD::Tex2DFloatFloatGrad:
2515     Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR;
2516     break;
2517   case NVPTXISD::Tex2DS32S32:
2518     Opc = NVPTX::TEX_2D_S32_S32_RR;
2519     break;
2520   case NVPTXISD::Tex2DS32Float:
2521     Opc = NVPTX::TEX_2D_S32_F32_RR;
2522     break;
2523   case NVPTXISD::Tex2DS32FloatLevel:
2524     Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR;
2525     break;
2526   case NVPTXISD::Tex2DS32FloatGrad:
2527     Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR;
2528     break;
2529   case NVPTXISD::Tex2DU32S32:
2530     Opc = NVPTX::TEX_2D_U32_S32_RR;
2531     break;
2532   case NVPTXISD::Tex2DU32Float:
2533     Opc = NVPTX::TEX_2D_U32_F32_RR;
2534     break;
2535   case NVPTXISD::Tex2DU32FloatLevel:
2536     Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR;
2537     break;
2538   case NVPTXISD::Tex2DU32FloatGrad:
2539     Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR;
2540     break;
2541   case NVPTXISD::Tex2DArrayFloatS32:
2542     Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR;
2543     break;
2544   case NVPTXISD::Tex2DArrayFloatFloat:
2545     Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR;
2546     break;
2547   case NVPTXISD::Tex2DArrayFloatFloatLevel:
2548     Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR;
2549     break;
2550   case NVPTXISD::Tex2DArrayFloatFloatGrad:
2551     Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR;
2552     break;
2553   case NVPTXISD::Tex2DArrayS32S32:
2554     Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR;
2555     break;
2556   case NVPTXISD::Tex2DArrayS32Float:
2557     Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR;
2558     break;
2559   case NVPTXISD::Tex2DArrayS32FloatLevel:
2560     Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR;
2561     break;
2562   case NVPTXISD::Tex2DArrayS32FloatGrad:
2563     Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR;
2564     break;
2565   case NVPTXISD::Tex2DArrayU32S32:
2566     Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR;
2567     break;
2568   case NVPTXISD::Tex2DArrayU32Float:
2569     Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR;
2570     break;
2571   case NVPTXISD::Tex2DArrayU32FloatLevel:
2572     Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR;
2573     break;
2574   case NVPTXISD::Tex2DArrayU32FloatGrad:
2575     Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR;
2576     break;
2577   case NVPTXISD::Tex3DFloatS32:
2578     Opc = NVPTX::TEX_3D_F32_S32_RR;
2579     break;
2580   case NVPTXISD::Tex3DFloatFloat:
2581     Opc = NVPTX::TEX_3D_F32_F32_RR;
2582     break;
2583   case NVPTXISD::Tex3DFloatFloatLevel:
2584     Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR;
2585     break;
2586   case NVPTXISD::Tex3DFloatFloatGrad:
2587     Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR;
2588     break;
2589   case NVPTXISD::Tex3DS32S32:
2590     Opc = NVPTX::TEX_3D_S32_S32_RR;
2591     break;
2592   case NVPTXISD::Tex3DS32Float:
2593     Opc = NVPTX::TEX_3D_S32_F32_RR;
2594     break;
2595   case NVPTXISD::Tex3DS32FloatLevel:
2596     Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR;
2597     break;
2598   case NVPTXISD::Tex3DS32FloatGrad:
2599     Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR;
2600     break;
2601   case NVPTXISD::Tex3DU32S32:
2602     Opc = NVPTX::TEX_3D_U32_S32_RR;
2603     break;
2604   case NVPTXISD::Tex3DU32Float:
2605     Opc = NVPTX::TEX_3D_U32_F32_RR;
2606     break;
2607   case NVPTXISD::Tex3DU32FloatLevel:
2608     Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR;
2609     break;
2610   case NVPTXISD::Tex3DU32FloatGrad:
2611     Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR;
2612     break;
2613   case NVPTXISD::TexCubeFloatFloat:
2614     Opc = NVPTX::TEX_CUBE_F32_F32_RR;
2615     break;
2616   case NVPTXISD::TexCubeFloatFloatLevel:
2617     Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR;
2618     break;
2619   case NVPTXISD::TexCubeS32Float:
2620     Opc = NVPTX::TEX_CUBE_S32_F32_RR;
2621     break;
2622   case NVPTXISD::TexCubeS32FloatLevel:
2623     Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR;
2624     break;
2625   case NVPTXISD::TexCubeU32Float:
2626     Opc = NVPTX::TEX_CUBE_U32_F32_RR;
2627     break;
2628   case NVPTXISD::TexCubeU32FloatLevel:
2629     Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR;
2630     break;
2631   case NVPTXISD::TexCubeArrayFloatFloat:
2632     Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR;
2633     break;
2634   case NVPTXISD::TexCubeArrayFloatFloatLevel:
2635     Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR;
2636     break;
2637   case NVPTXISD::TexCubeArrayS32Float:
2638     Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR;
2639     break;
2640   case NVPTXISD::TexCubeArrayS32FloatLevel:
2641     Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR;
2642     break;
2643   case NVPTXISD::TexCubeArrayU32Float:
2644     Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR;
2645     break;
2646   case NVPTXISD::TexCubeArrayU32FloatLevel:
2647     Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR;
2648     break;
2649   case NVPTXISD::Tld4R2DFloatFloat:
2650     Opc = NVPTX::TLD4_R_2D_F32_F32_RR;
2651     break;
2652   case NVPTXISD::Tld4G2DFloatFloat:
2653     Opc = NVPTX::TLD4_G_2D_F32_F32_RR;
2654     break;
2655   case NVPTXISD::Tld4B2DFloatFloat:
2656     Opc = NVPTX::TLD4_B_2D_F32_F32_RR;
2657     break;
2658   case NVPTXISD::Tld4A2DFloatFloat:
2659     Opc = NVPTX::TLD4_A_2D_F32_F32_RR;
2660     break;
2661   case NVPTXISD::Tld4R2DS64Float:
2662     Opc = NVPTX::TLD4_R_2D_S32_F32_RR;
2663     break;
2664   case NVPTXISD::Tld4G2DS64Float:
2665     Opc = NVPTX::TLD4_G_2D_S32_F32_RR;
2666     break;
2667   case NVPTXISD::Tld4B2DS64Float:
2668     Opc = NVPTX::TLD4_B_2D_S32_F32_RR;
2669     break;
2670   case NVPTXISD::Tld4A2DS64Float:
2671     Opc = NVPTX::TLD4_A_2D_S32_F32_RR;
2672     break;
2673   case NVPTXISD::Tld4R2DU64Float:
2674     Opc = NVPTX::TLD4_R_2D_U32_F32_RR;
2675     break;
2676   case NVPTXISD::Tld4G2DU64Float:
2677     Opc = NVPTX::TLD4_G_2D_U32_F32_RR;
2678     break;
2679   case NVPTXISD::Tld4B2DU64Float:
2680     Opc = NVPTX::TLD4_B_2D_U32_F32_RR;
2681     break;
2682   case NVPTXISD::Tld4A2DU64Float:
2683     Opc = NVPTX::TLD4_A_2D_U32_F32_RR;
2684     break;
2685   case NVPTXISD::TexUnified1DFloatS32:
2686     Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R;
2687     break;
2688   case NVPTXISD::TexUnified1DFloatFloat:
2689     Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R;
2690     break;
2691   case NVPTXISD::TexUnified1DFloatFloatLevel:
2692     Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R;
2693     break;
2694   case NVPTXISD::TexUnified1DFloatFloatGrad:
2695     Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R;
2696     break;
2697   case NVPTXISD::TexUnified1DS32S32:
2698     Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R;
2699     break;
2700   case NVPTXISD::TexUnified1DS32Float:
2701     Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R;
2702     break;
2703   case NVPTXISD::TexUnified1DS32FloatLevel:
2704     Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R;
2705     break;
2706   case NVPTXISD::TexUnified1DS32FloatGrad:
2707     Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R;
2708     break;
2709   case NVPTXISD::TexUnified1DU32S32:
2710     Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R;
2711     break;
2712   case NVPTXISD::TexUnified1DU32Float:
2713     Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R;
2714     break;
2715   case NVPTXISD::TexUnified1DU32FloatLevel:
2716     Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R;
2717     break;
2718   case NVPTXISD::TexUnified1DU32FloatGrad:
2719     Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R;
2720     break;
2721   case NVPTXISD::TexUnified1DArrayFloatS32:
2722     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R;
2723     break;
2724   case NVPTXISD::TexUnified1DArrayFloatFloat:
2725     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R;
2726     break;
2727   case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
2728     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R;
2729     break;
2730   case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
2731     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R;
2732     break;
2733   case NVPTXISD::TexUnified1DArrayS32S32:
2734     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R;
2735     break;
2736   case NVPTXISD::TexUnified1DArrayS32Float:
2737     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R;
2738     break;
2739   case NVPTXISD::TexUnified1DArrayS32FloatLevel:
2740     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R;
2741     break;
2742   case NVPTXISD::TexUnified1DArrayS32FloatGrad:
2743     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R;
2744     break;
2745   case NVPTXISD::TexUnified1DArrayU32S32:
2746     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R;
2747     break;
2748   case NVPTXISD::TexUnified1DArrayU32Float:
2749     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R;
2750     break;
2751   case NVPTXISD::TexUnified1DArrayU32FloatLevel:
2752     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R;
2753     break;
2754   case NVPTXISD::TexUnified1DArrayU32FloatGrad:
2755     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R;
2756     break;
2757   case NVPTXISD::TexUnified2DFloatS32:
2758     Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R;
2759     break;
2760   case NVPTXISD::TexUnified2DFloatFloat:
2761     Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R;
2762     break;
2763   case NVPTXISD::TexUnified2DFloatFloatLevel:
2764     Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R;
2765     break;
2766   case NVPTXISD::TexUnified2DFloatFloatGrad:
2767     Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R;
2768     break;
2769   case NVPTXISD::TexUnified2DS32S32:
2770     Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R;
2771     break;
2772   case NVPTXISD::TexUnified2DS32Float:
2773     Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R;
2774     break;
2775   case NVPTXISD::TexUnified2DS32FloatLevel:
2776     Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R;
2777     break;
2778   case NVPTXISD::TexUnified2DS32FloatGrad:
2779     Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R;
2780     break;
2781   case NVPTXISD::TexUnified2DU32S32:
2782     Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R;
2783     break;
2784   case NVPTXISD::TexUnified2DU32Float:
2785     Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R;
2786     break;
2787   case NVPTXISD::TexUnified2DU32FloatLevel:
2788     Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R;
2789     break;
2790   case NVPTXISD::TexUnified2DU32FloatGrad:
2791     Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R;
2792     break;
2793   case NVPTXISD::TexUnified2DArrayFloatS32:
2794     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R;
2795     break;
2796   case NVPTXISD::TexUnified2DArrayFloatFloat:
2797     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R;
2798     break;
2799   case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
2800     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R;
2801     break;
2802   case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
2803     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R;
2804     break;
2805   case NVPTXISD::TexUnified2DArrayS32S32:
2806     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R;
2807     break;
2808   case NVPTXISD::TexUnified2DArrayS32Float:
2809     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R;
2810     break;
2811   case NVPTXISD::TexUnified2DArrayS32FloatLevel:
2812     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R;
2813     break;
2814   case NVPTXISD::TexUnified2DArrayS32FloatGrad:
2815     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R;
2816     break;
2817   case NVPTXISD::TexUnified2DArrayU32S32:
2818     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R;
2819     break;
2820   case NVPTXISD::TexUnified2DArrayU32Float:
2821     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R;
2822     break;
2823   case NVPTXISD::TexUnified2DArrayU32FloatLevel:
2824     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R;
2825     break;
2826   case NVPTXISD::TexUnified2DArrayU32FloatGrad:
2827     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R;
2828     break;
2829   case NVPTXISD::TexUnified3DFloatS32:
2830     Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R;
2831     break;
2832   case NVPTXISD::TexUnified3DFloatFloat:
2833     Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R;
2834     break;
2835   case NVPTXISD::TexUnified3DFloatFloatLevel:
2836     Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R;
2837     break;
2838   case NVPTXISD::TexUnified3DFloatFloatGrad:
2839     Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R;
2840     break;
2841   case NVPTXISD::TexUnified3DS32S32:
2842     Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R;
2843     break;
2844   case NVPTXISD::TexUnified3DS32Float:
2845     Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R;
2846     break;
2847   case NVPTXISD::TexUnified3DS32FloatLevel:
2848     Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R;
2849     break;
2850   case NVPTXISD::TexUnified3DS32FloatGrad:
2851     Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R;
2852     break;
2853   case NVPTXISD::TexUnified3DU32S32:
2854     Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R;
2855     break;
2856   case NVPTXISD::TexUnified3DU32Float:
2857     Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R;
2858     break;
2859   case NVPTXISD::TexUnified3DU32FloatLevel:
2860     Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R;
2861     break;
2862   case NVPTXISD::TexUnified3DU32FloatGrad:
2863     Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R;
2864     break;
2865   case NVPTXISD::TexUnifiedCubeFloatFloat:
2866     Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R;
2867     break;
2868   case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
2869     Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R;
2870     break;
2871   case NVPTXISD::TexUnifiedCubeS32Float:
2872     Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R;
2873     break;
2874   case NVPTXISD::TexUnifiedCubeS32FloatLevel:
2875     Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R;
2876     break;
2877   case NVPTXISD::TexUnifiedCubeU32Float:
2878     Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R;
2879     break;
2880   case NVPTXISD::TexUnifiedCubeU32FloatLevel:
2881     Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R;
2882     break;
2883   case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
2884     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R;
2885     break;
2886   case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
2887     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R;
2888     break;
2889   case NVPTXISD::TexUnifiedCubeArrayS32Float:
2890     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R;
2891     break;
2892   case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
2893     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R;
2894     break;
2895   case NVPTXISD::TexUnifiedCubeArrayU32Float:
2896     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R;
2897     break;
2898   case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
2899     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R;
2900     break;
2901   case NVPTXISD::Tld4UnifiedR2DFloatFloat:
2902     Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R;
2903     break;
2904   case NVPTXISD::Tld4UnifiedG2DFloatFloat:
2905     Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R;
2906     break;
2907   case NVPTXISD::Tld4UnifiedB2DFloatFloat:
2908     Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R;
2909     break;
2910   case NVPTXISD::Tld4UnifiedA2DFloatFloat:
2911     Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R;
2912     break;
2913   case NVPTXISD::Tld4UnifiedR2DS64Float:
2914     Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R;
2915     break;
2916   case NVPTXISD::Tld4UnifiedG2DS64Float:
2917     Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R;
2918     break;
2919   case NVPTXISD::Tld4UnifiedB2DS64Float:
2920     Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R;
2921     break;
2922   case NVPTXISD::Tld4UnifiedA2DS64Float:
2923     Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R;
2924     break;
2925   case NVPTXISD::Tld4UnifiedR2DU64Float:
2926     Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R;
2927     break;
2928   case NVPTXISD::Tld4UnifiedG2DU64Float:
2929     Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R;
2930     break;
2931   case NVPTXISD::Tld4UnifiedB2DU64Float:
2932     Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R;
2933     break;
2934   case NVPTXISD::Tld4UnifiedA2DU64Float:
2935     Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R;
2936     break;
2937   case NVPTXISD::TexUnifiedCubeFloatFloatGrad:
2938     Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_GRAD_R;
2939     break;
2940   case NVPTXISD::TexUnifiedCubeS32FloatGrad:
2941     Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_GRAD_R;
2942     break;
2943   case NVPTXISD::TexUnifiedCubeU32FloatGrad:
2944     Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_GRAD_R;
2945     break;
2946   case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad:
2947     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_GRAD_R;
2948     break;
2949   case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad:
2950     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_GRAD_R;
2951     break;
2952   case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad:
2953     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_GRAD_R;
2954     break;
2955   }
2956 
2957   // Copy over operands
2958   SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
2959   Ops.push_back(N->getOperand(0)); // Move chain to the back.
2960 
2961   ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
2962   return true;
2963 }
2964 
2965 bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
2966   unsigned Opc = 0;
2967   switch (N->getOpcode()) {
2968   default: return false;
2969   case NVPTXISD::Suld1DI8Clamp:
2970     Opc = NVPTX::SULD_1D_I8_CLAMP_R;
2971     break;
2972   case NVPTXISD::Suld1DI16Clamp:
2973     Opc = NVPTX::SULD_1D_I16_CLAMP_R;
2974     break;
2975   case NVPTXISD::Suld1DI32Clamp:
2976     Opc = NVPTX::SULD_1D_I32_CLAMP_R;
2977     break;
2978   case NVPTXISD::Suld1DI64Clamp:
2979     Opc = NVPTX::SULD_1D_I64_CLAMP_R;
2980     break;
2981   case NVPTXISD::Suld1DV2I8Clamp:
2982     Opc = NVPTX::SULD_1D_V2I8_CLAMP_R;
2983     break;
2984   case NVPTXISD::Suld1DV2I16Clamp:
2985     Opc = NVPTX::SULD_1D_V2I16_CLAMP_R;
2986     break;
2987   case NVPTXISD::Suld1DV2I32Clamp:
2988     Opc = NVPTX::SULD_1D_V2I32_CLAMP_R;
2989     break;
2990   case NVPTXISD::Suld1DV2I64Clamp:
2991     Opc = NVPTX::SULD_1D_V2I64_CLAMP_R;
2992     break;
2993   case NVPTXISD::Suld1DV4I8Clamp:
2994     Opc = NVPTX::SULD_1D_V4I8_CLAMP_R;
2995     break;
2996   case NVPTXISD::Suld1DV4I16Clamp:
2997     Opc = NVPTX::SULD_1D_V4I16_CLAMP_R;
2998     break;
2999   case NVPTXISD::Suld1DV4I32Clamp:
3000     Opc = NVPTX::SULD_1D_V4I32_CLAMP_R;
3001     break;
3002   case NVPTXISD::Suld1DArrayI8Clamp:
3003     Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R;
3004     break;
3005   case NVPTXISD::Suld1DArrayI16Clamp:
3006     Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R;
3007     break;
3008   case NVPTXISD::Suld1DArrayI32Clamp:
3009     Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R;
3010     break;
3011   case NVPTXISD::Suld1DArrayI64Clamp:
3012     Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R;
3013     break;
3014   case NVPTXISD::Suld1DArrayV2I8Clamp:
3015     Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R;
3016     break;
3017   case NVPTXISD::Suld1DArrayV2I16Clamp:
3018     Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R;
3019     break;
3020   case NVPTXISD::Suld1DArrayV2I32Clamp:
3021     Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R;
3022     break;
3023   case NVPTXISD::Suld1DArrayV2I64Clamp:
3024     Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R;
3025     break;
3026   case NVPTXISD::Suld1DArrayV4I8Clamp:
3027     Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R;
3028     break;
3029   case NVPTXISD::Suld1DArrayV4I16Clamp:
3030     Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R;
3031     break;
3032   case NVPTXISD::Suld1DArrayV4I32Clamp:
3033     Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R;
3034     break;
3035   case NVPTXISD::Suld2DI8Clamp:
3036     Opc = NVPTX::SULD_2D_I8_CLAMP_R;
3037     break;
3038   case NVPTXISD::Suld2DI16Clamp:
3039     Opc = NVPTX::SULD_2D_I16_CLAMP_R;
3040     break;
3041   case NVPTXISD::Suld2DI32Clamp:
3042     Opc = NVPTX::SULD_2D_I32_CLAMP_R;
3043     break;
3044   case NVPTXISD::Suld2DI64Clamp:
3045     Opc = NVPTX::SULD_2D_I64_CLAMP_R;
3046     break;
3047   case NVPTXISD::Suld2DV2I8Clamp:
3048     Opc = NVPTX::SULD_2D_V2I8_CLAMP_R;
3049     break;
3050   case NVPTXISD::Suld2DV2I16Clamp:
3051     Opc = NVPTX::SULD_2D_V2I16_CLAMP_R;
3052     break;
3053   case NVPTXISD::Suld2DV2I32Clamp:
3054     Opc = NVPTX::SULD_2D_V2I32_CLAMP_R;
3055     break;
3056   case NVPTXISD::Suld2DV2I64Clamp:
3057     Opc = NVPTX::SULD_2D_V2I64_CLAMP_R;
3058     break;
3059   case NVPTXISD::Suld2DV4I8Clamp:
3060     Opc = NVPTX::SULD_2D_V4I8_CLAMP_R;
3061     break;
3062   case NVPTXISD::Suld2DV4I16Clamp:
3063     Opc = NVPTX::SULD_2D_V4I16_CLAMP_R;
3064     break;
3065   case NVPTXISD::Suld2DV4I32Clamp:
3066     Opc = NVPTX::SULD_2D_V4I32_CLAMP_R;
3067     break;
3068   case NVPTXISD::Suld2DArrayI8Clamp:
3069     Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R;
3070     break;
3071   case NVPTXISD::Suld2DArrayI16Clamp:
3072     Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R;
3073     break;
3074   case NVPTXISD::Suld2DArrayI32Clamp:
3075     Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R;
3076     break;
3077   case NVPTXISD::Suld2DArrayI64Clamp:
3078     Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R;
3079     break;
3080   case NVPTXISD::Suld2DArrayV2I8Clamp:
3081     Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R;
3082     break;
3083   case NVPTXISD::Suld2DArrayV2I16Clamp:
3084     Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R;
3085     break;
3086   case NVPTXISD::Suld2DArrayV2I32Clamp:
3087     Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R;
3088     break;
3089   case NVPTXISD::Suld2DArrayV2I64Clamp:
3090     Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R;
3091     break;
3092   case NVPTXISD::Suld2DArrayV4I8Clamp:
3093     Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R;
3094     break;
3095   case NVPTXISD::Suld2DArrayV4I16Clamp:
3096     Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R;
3097     break;
3098   case NVPTXISD::Suld2DArrayV4I32Clamp:
3099     Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R;
3100     break;
3101   case NVPTXISD::Suld3DI8Clamp:
3102     Opc = NVPTX::SULD_3D_I8_CLAMP_R;
3103     break;
3104   case NVPTXISD::Suld3DI16Clamp:
3105     Opc = NVPTX::SULD_3D_I16_CLAMP_R;
3106     break;
3107   case NVPTXISD::Suld3DI32Clamp:
3108     Opc = NVPTX::SULD_3D_I32_CLAMP_R;
3109     break;
3110   case NVPTXISD::Suld3DI64Clamp:
3111     Opc = NVPTX::SULD_3D_I64_CLAMP_R;
3112     break;
3113   case NVPTXISD::Suld3DV2I8Clamp:
3114     Opc = NVPTX::SULD_3D_V2I8_CLAMP_R;
3115     break;
3116   case NVPTXISD::Suld3DV2I16Clamp:
3117     Opc = NVPTX::SULD_3D_V2I16_CLAMP_R;
3118     break;
3119   case NVPTXISD::Suld3DV2I32Clamp:
3120     Opc = NVPTX::SULD_3D_V2I32_CLAMP_R;
3121     break;
3122   case NVPTXISD::Suld3DV2I64Clamp:
3123     Opc = NVPTX::SULD_3D_V2I64_CLAMP_R;
3124     break;
3125   case NVPTXISD::Suld3DV4I8Clamp:
3126     Opc = NVPTX::SULD_3D_V4I8_CLAMP_R;
3127     break;
3128   case NVPTXISD::Suld3DV4I16Clamp:
3129     Opc = NVPTX::SULD_3D_V4I16_CLAMP_R;
3130     break;
3131   case NVPTXISD::Suld3DV4I32Clamp:
3132     Opc = NVPTX::SULD_3D_V4I32_CLAMP_R;
3133     break;
3134   case NVPTXISD::Suld1DI8Trap:
3135     Opc = NVPTX::SULD_1D_I8_TRAP_R;
3136     break;
3137   case NVPTXISD::Suld1DI16Trap:
3138     Opc = NVPTX::SULD_1D_I16_TRAP_R;
3139     break;
3140   case NVPTXISD::Suld1DI32Trap:
3141     Opc = NVPTX::SULD_1D_I32_TRAP_R;
3142     break;
3143   case NVPTXISD::Suld1DI64Trap:
3144     Opc = NVPTX::SULD_1D_I64_TRAP_R;
3145     break;
3146   case NVPTXISD::Suld1DV2I8Trap:
3147     Opc = NVPTX::SULD_1D_V2I8_TRAP_R;
3148     break;
3149   case NVPTXISD::Suld1DV2I16Trap:
3150     Opc = NVPTX::SULD_1D_V2I16_TRAP_R;
3151     break;
3152   case NVPTXISD::Suld1DV2I32Trap:
3153     Opc = NVPTX::SULD_1D_V2I32_TRAP_R;
3154     break;
3155   case NVPTXISD::Suld1DV2I64Trap:
3156     Opc = NVPTX::SULD_1D_V2I64_TRAP_R;
3157     break;
3158   case NVPTXISD::Suld1DV4I8Trap:
3159     Opc = NVPTX::SULD_1D_V4I8_TRAP_R;
3160     break;
3161   case NVPTXISD::Suld1DV4I16Trap:
3162     Opc = NVPTX::SULD_1D_V4I16_TRAP_R;
3163     break;
3164   case NVPTXISD::Suld1DV4I32Trap:
3165     Opc = NVPTX::SULD_1D_V4I32_TRAP_R;
3166     break;
3167   case NVPTXISD::Suld1DArrayI8Trap:
3168     Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R;
3169     break;
3170   case NVPTXISD::Suld1DArrayI16Trap:
3171     Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R;
3172     break;
3173   case NVPTXISD::Suld1DArrayI32Trap:
3174     Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R;
3175     break;
3176   case NVPTXISD::Suld1DArrayI64Trap:
3177     Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R;
3178     break;
3179   case NVPTXISD::Suld1DArrayV2I8Trap:
3180     Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R;
3181     break;
3182   case NVPTXISD::Suld1DArrayV2I16Trap:
3183     Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R;
3184     break;
3185   case NVPTXISD::Suld1DArrayV2I32Trap:
3186     Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R;
3187     break;
3188   case NVPTXISD::Suld1DArrayV2I64Trap:
3189     Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R;
3190     break;
3191   case NVPTXISD::Suld1DArrayV4I8Trap:
3192     Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R;
3193     break;
3194   case NVPTXISD::Suld1DArrayV4I16Trap:
3195     Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R;
3196     break;
3197   case NVPTXISD::Suld1DArrayV4I32Trap:
3198     Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R;
3199     break;
3200   case NVPTXISD::Suld2DI8Trap:
3201     Opc = NVPTX::SULD_2D_I8_TRAP_R;
3202     break;
3203   case NVPTXISD::Suld2DI16Trap:
3204     Opc = NVPTX::SULD_2D_I16_TRAP_R;
3205     break;
3206   case NVPTXISD::Suld2DI32Trap:
3207     Opc = NVPTX::SULD_2D_I32_TRAP_R;
3208     break;
3209   case NVPTXISD::Suld2DI64Trap:
3210     Opc = NVPTX::SULD_2D_I64_TRAP_R;
3211     break;
3212   case NVPTXISD::Suld2DV2I8Trap:
3213     Opc = NVPTX::SULD_2D_V2I8_TRAP_R;
3214     break;
3215   case NVPTXISD::Suld2DV2I16Trap:
3216     Opc = NVPTX::SULD_2D_V2I16_TRAP_R;
3217     break;
3218   case NVPTXISD::Suld2DV2I32Trap:
3219     Opc = NVPTX::SULD_2D_V2I32_TRAP_R;
3220     break;
3221   case NVPTXISD::Suld2DV2I64Trap:
3222     Opc = NVPTX::SULD_2D_V2I64_TRAP_R;
3223     break;
3224   case NVPTXISD::Suld2DV4I8Trap:
3225     Opc = NVPTX::SULD_2D_V4I8_TRAP_R;
3226     break;
3227   case NVPTXISD::Suld2DV4I16Trap:
3228     Opc = NVPTX::SULD_2D_V4I16_TRAP_R;
3229     break;
3230   case NVPTXISD::Suld2DV4I32Trap:
3231     Opc = NVPTX::SULD_2D_V4I32_TRAP_R;
3232     break;
3233   case NVPTXISD::Suld2DArrayI8Trap:
3234     Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R;
3235     break;
3236   case NVPTXISD::Suld2DArrayI16Trap:
3237     Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R;
3238     break;
3239   case NVPTXISD::Suld2DArrayI32Trap:
3240     Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R;
3241     break;
3242   case NVPTXISD::Suld2DArrayI64Trap:
3243     Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R;
3244     break;
3245   case NVPTXISD::Suld2DArrayV2I8Trap:
3246     Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R;
3247     break;
3248   case NVPTXISD::Suld2DArrayV2I16Trap:
3249     Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R;
3250     break;
3251   case NVPTXISD::Suld2DArrayV2I32Trap:
3252     Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R;
3253     break;
3254   case NVPTXISD::Suld2DArrayV2I64Trap:
3255     Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R;
3256     break;
3257   case NVPTXISD::Suld2DArrayV4I8Trap:
3258     Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R;
3259     break;
3260   case NVPTXISD::Suld2DArrayV4I16Trap:
3261     Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R;
3262     break;
3263   case NVPTXISD::Suld2DArrayV4I32Trap:
3264     Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R;
3265     break;
3266   case NVPTXISD::Suld3DI8Trap:
3267     Opc = NVPTX::SULD_3D_I8_TRAP_R;
3268     break;
3269   case NVPTXISD::Suld3DI16Trap:
3270     Opc = NVPTX::SULD_3D_I16_TRAP_R;
3271     break;
3272   case NVPTXISD::Suld3DI32Trap:
3273     Opc = NVPTX::SULD_3D_I32_TRAP_R;
3274     break;
3275   case NVPTXISD::Suld3DI64Trap:
3276     Opc = NVPTX::SULD_3D_I64_TRAP_R;
3277     break;
3278   case NVPTXISD::Suld3DV2I8Trap:
3279     Opc = NVPTX::SULD_3D_V2I8_TRAP_R;
3280     break;
3281   case NVPTXISD::Suld3DV2I16Trap:
3282     Opc = NVPTX::SULD_3D_V2I16_TRAP_R;
3283     break;
3284   case NVPTXISD::Suld3DV2I32Trap:
3285     Opc = NVPTX::SULD_3D_V2I32_TRAP_R;
3286     break;
3287   case NVPTXISD::Suld3DV2I64Trap:
3288     Opc = NVPTX::SULD_3D_V2I64_TRAP_R;
3289     break;
3290   case NVPTXISD::Suld3DV4I8Trap:
3291     Opc = NVPTX::SULD_3D_V4I8_TRAP_R;
3292     break;
3293   case NVPTXISD::Suld3DV4I16Trap:
3294     Opc = NVPTX::SULD_3D_V4I16_TRAP_R;
3295     break;
3296   case NVPTXISD::Suld3DV4I32Trap:
3297     Opc = NVPTX::SULD_3D_V4I32_TRAP_R;
3298     break;
3299   case NVPTXISD::Suld1DI8Zero:
3300     Opc = NVPTX::SULD_1D_I8_ZERO_R;
3301     break;
3302   case NVPTXISD::Suld1DI16Zero:
3303     Opc = NVPTX::SULD_1D_I16_ZERO_R;
3304     break;
3305   case NVPTXISD::Suld1DI32Zero:
3306     Opc = NVPTX::SULD_1D_I32_ZERO_R;
3307     break;
3308   case NVPTXISD::Suld1DI64Zero:
3309     Opc = NVPTX::SULD_1D_I64_ZERO_R;
3310     break;
3311   case NVPTXISD::Suld1DV2I8Zero:
3312     Opc = NVPTX::SULD_1D_V2I8_ZERO_R;
3313     break;
3314   case NVPTXISD::Suld1DV2I16Zero:
3315     Opc = NVPTX::SULD_1D_V2I16_ZERO_R;
3316     break;
3317   case NVPTXISD::Suld1DV2I32Zero:
3318     Opc = NVPTX::SULD_1D_V2I32_ZERO_R;
3319     break;
3320   case NVPTXISD::Suld1DV2I64Zero:
3321     Opc = NVPTX::SULD_1D_V2I64_ZERO_R;
3322     break;
3323   case NVPTXISD::Suld1DV4I8Zero:
3324     Opc = NVPTX::SULD_1D_V4I8_ZERO_R;
3325     break;
3326   case NVPTXISD::Suld1DV4I16Zero:
3327     Opc = NVPTX::SULD_1D_V4I16_ZERO_R;
3328     break;
3329   case NVPTXISD::Suld1DV4I32Zero:
3330     Opc = NVPTX::SULD_1D_V4I32_ZERO_R;
3331     break;
3332   case NVPTXISD::Suld1DArrayI8Zero:
3333     Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R;
3334     break;
3335   case NVPTXISD::Suld1DArrayI16Zero:
3336     Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R;
3337     break;
3338   case NVPTXISD::Suld1DArrayI32Zero:
3339     Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R;
3340     break;
3341   case NVPTXISD::Suld1DArrayI64Zero:
3342     Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R;
3343     break;
3344   case NVPTXISD::Suld1DArrayV2I8Zero:
3345     Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R;
3346     break;
3347   case NVPTXISD::Suld1DArrayV2I16Zero:
3348     Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R;
3349     break;
3350   case NVPTXISD::Suld1DArrayV2I32Zero:
3351     Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R;
3352     break;
3353   case NVPTXISD::Suld1DArrayV2I64Zero:
3354     Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R;
3355     break;
3356   case NVPTXISD::Suld1DArrayV4I8Zero:
3357     Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R;
3358     break;
3359   case NVPTXISD::Suld1DArrayV4I16Zero:
3360     Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R;
3361     break;
3362   case NVPTXISD::Suld1DArrayV4I32Zero:
3363     Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R;
3364     break;
3365   case NVPTXISD::Suld2DI8Zero:
3366     Opc = NVPTX::SULD_2D_I8_ZERO_R;
3367     break;
3368   case NVPTXISD::Suld2DI16Zero:
3369     Opc = NVPTX::SULD_2D_I16_ZERO_R;
3370     break;
3371   case NVPTXISD::Suld2DI32Zero:
3372     Opc = NVPTX::SULD_2D_I32_ZERO_R;
3373     break;
3374   case NVPTXISD::Suld2DI64Zero:
3375     Opc = NVPTX::SULD_2D_I64_ZERO_R;
3376     break;
3377   case NVPTXISD::Suld2DV2I8Zero:
3378     Opc = NVPTX::SULD_2D_V2I8_ZERO_R;
3379     break;
3380   case NVPTXISD::Suld2DV2I16Zero:
3381     Opc = NVPTX::SULD_2D_V2I16_ZERO_R;
3382     break;
3383   case NVPTXISD::Suld2DV2I32Zero:
3384     Opc = NVPTX::SULD_2D_V2I32_ZERO_R;
3385     break;
3386   case NVPTXISD::Suld2DV2I64Zero:
3387     Opc = NVPTX::SULD_2D_V2I64_ZERO_R;
3388     break;
3389   case NVPTXISD::Suld2DV4I8Zero:
3390     Opc = NVPTX::SULD_2D_V4I8_ZERO_R;
3391     break;
3392   case NVPTXISD::Suld2DV4I16Zero:
3393     Opc = NVPTX::SULD_2D_V4I16_ZERO_R;
3394     break;
3395   case NVPTXISD::Suld2DV4I32Zero:
3396     Opc = NVPTX::SULD_2D_V4I32_ZERO_R;
3397     break;
3398   case NVPTXISD::Suld2DArrayI8Zero:
3399     Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;
3400     break;
3401   case NVPTXISD::Suld2DArrayI16Zero:
3402     Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;
3403     break;
3404   case NVPTXISD::Suld2DArrayI32Zero:
3405     Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;
3406     break;
3407   case NVPTXISD::Suld2DArrayI64Zero:
3408     Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;
3409     break;
3410   case NVPTXISD::Suld2DArrayV2I8Zero:
3411     Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;
3412     break;
3413   case NVPTXISD::Suld2DArrayV2I16Zero:
3414     Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;
3415     break;
3416   case NVPTXISD::Suld2DArrayV2I32Zero:
3417     Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;
3418     break;
3419   case NVPTXISD::Suld2DArrayV2I64Zero:
3420     Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;
3421     break;
3422   case NVPTXISD::Suld2DArrayV4I8Zero:
3423     Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;
3424     break;
3425   case NVPTXISD::Suld2DArrayV4I16Zero:
3426     Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;
3427     break;
3428   case NVPTXISD::Suld2DArrayV4I32Zero:
3429     Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;
3430     break;
3431   case NVPTXISD::Suld3DI8Zero:
3432     Opc = NVPTX::SULD_3D_I8_ZERO_R;
3433     break;
3434   case NVPTXISD::Suld3DI16Zero:
3435     Opc = NVPTX::SULD_3D_I16_ZERO_R;
3436     break;
3437   case NVPTXISD::Suld3DI32Zero:
3438     Opc = NVPTX::SULD_3D_I32_ZERO_R;
3439     break;
3440   case NVPTXISD::Suld3DI64Zero:
3441     Opc = NVPTX::SULD_3D_I64_ZERO_R;
3442     break;
3443   case NVPTXISD::Suld3DV2I8Zero:
3444     Opc = NVPTX::SULD_3D_V2I8_ZERO_R;
3445     break;
3446   case NVPTXISD::Suld3DV2I16Zero:
3447     Opc = NVPTX::SULD_3D_V2I16_ZERO_R;
3448     break;
3449   case NVPTXISD::Suld3DV2I32Zero:
3450     Opc = NVPTX::SULD_3D_V2I32_ZERO_R;
3451     break;
3452   case NVPTXISD::Suld3DV2I64Zero:
3453     Opc = NVPTX::SULD_3D_V2I64_ZERO_R;
3454     break;
3455   case NVPTXISD::Suld3DV4I8Zero:
3456     Opc = NVPTX::SULD_3D_V4I8_ZERO_R;
3457     break;
3458   case NVPTXISD::Suld3DV4I16Zero:
3459     Opc = NVPTX::SULD_3D_V4I16_ZERO_R;
3460     break;
3461   case NVPTXISD::Suld3DV4I32Zero:
3462     Opc = NVPTX::SULD_3D_V4I32_ZERO_R;
3463     break;
3464   }
3465 
3466   // Copy over operands
3467   SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
3468   Ops.push_back(N->getOperand(0)); // Move chain to the back.
3469 
3470   ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3471   return true;
3472 }
3473 
3474 
3475 /// SelectBFE - Look for instruction sequences that can be made more efficient
3476 /// by using the 'bfe' (bit-field extract) PTX instruction
3477 bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
3478   SDLoc DL(N);
3479   SDValue LHS = N->getOperand(0);
3480   SDValue RHS = N->getOperand(1);
3481   SDValue Len;
3482   SDValue Start;
3483   SDValue Val;
3484   bool IsSigned = false;
3485 
3486   if (N->getOpcode() == ISD::AND) {
3487     // Canonicalize the operands
3488     // We want 'and %val, %mask'
3489     if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
3490       std::swap(LHS, RHS);
3491     }
3492 
3493     ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
3494     if (!Mask) {
3495       // We need a constant mask on the RHS of the AND
3496       return false;
3497     }
3498 
3499     // Extract the mask bits
3500     uint64_t MaskVal = Mask->getZExtValue();
3501     if (!isMask_64(MaskVal)) {
3502       // We *could* handle shifted masks here, but doing so would require an
3503       // 'and' operation to fix up the low-order bits so we would trade
3504       // shr+and for bfe+and, which has the same throughput
3505       return false;
3506     }
3507 
3508     // How many bits are in our mask?
3509     int64_t NumBits = countr_one(MaskVal);
3510     Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3511 
3512     if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
3513       // We have a 'srl/and' pair, extract the effective start bit and length
3514       Val = LHS.getNode()->getOperand(0);
3515       Start = LHS.getNode()->getOperand(1);
3516       ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
3517       if (StartConst) {
3518         uint64_t StartVal = StartConst->getZExtValue();
3519         // How many "good" bits do we have left?  "good" is defined here as bits
3520         // that exist in the original value, not shifted in.
3521         int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
3522         if (NumBits > GoodBits) {
3523           // Do not handle the case where bits have been shifted in. In theory
3524           // we could handle this, but the cost is likely higher than just
3525           // emitting the srl/and pair.
3526           return false;
3527         }
3528         Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
3529       } else {
3530         // Do not handle the case where the shift amount (can be zero if no srl
3531         // was found) is not constant. We could handle this case, but it would
3532         // require run-time logic that would be more expensive than just
3533         // emitting the srl/and pair.
3534         return false;
3535       }
3536     } else {
3537       // Do not handle the case where the LHS of the and is not a shift. While
3538       // it would be trivial to handle this case, it would just transform
3539       // 'and' -> 'bfe', but 'and' has higher-throughput.
3540       return false;
3541     }
3542   } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
3543     if (LHS->getOpcode() == ISD::AND) {
3544       ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
3545       if (!ShiftCnst) {
3546         // Shift amount must be constant
3547         return false;
3548       }
3549 
3550       uint64_t ShiftAmt = ShiftCnst->getZExtValue();
3551 
3552       SDValue AndLHS = LHS->getOperand(0);
3553       SDValue AndRHS = LHS->getOperand(1);
3554 
3555       // Canonicalize the AND to have the mask on the RHS
3556       if (isa<ConstantSDNode>(AndLHS)) {
3557         std::swap(AndLHS, AndRHS);
3558       }
3559 
3560       ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
3561       if (!MaskCnst) {
3562         // Mask must be constant
3563         return false;
3564       }
3565 
3566       uint64_t MaskVal = MaskCnst->getZExtValue();
3567       uint64_t NumZeros;
3568       uint64_t NumBits;
3569       if (isMask_64(MaskVal)) {
3570         NumZeros = 0;
3571         // The number of bits in the result bitfield will be the number of
3572         // trailing ones (the AND) minus the number of bits we shift off
3573         NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
3574       } else if (isShiftedMask_64(MaskVal)) {
3575         NumZeros = llvm::countr_zero(MaskVal);
3576         unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
3577         // The number of bits in the result bitfield will be the number of
3578         // trailing zeros plus the number of set bits in the mask minus the
3579         // number of bits we shift off
3580         NumBits = NumZeros + NumOnes - ShiftAmt;
3581       } else {
3582         // This is not a mask we can handle
3583         return false;
3584       }
3585 
3586       if (ShiftAmt < NumZeros) {
3587         // Handling this case would require extra logic that would make this
3588         // transformation non-profitable
3589         return false;
3590       }
3591 
3592       Val = AndLHS;
3593       Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
3594       Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3595     } else if (LHS->getOpcode() == ISD::SHL) {
3596       // Here, we have a pattern like:
3597       //
3598       // (sra (shl val, NN), MM)
3599       // or
3600       // (srl (shl val, NN), MM)
3601       //
3602       // If MM >= NN, we can efficiently optimize this with bfe
3603       Val = LHS->getOperand(0);
3604 
3605       SDValue ShlRHS = LHS->getOperand(1);
3606       ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
3607       if (!ShlCnst) {
3608         // Shift amount must be constant
3609         return false;
3610       }
3611       uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
3612 
3613       SDValue ShrRHS = RHS;
3614       ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
3615       if (!ShrCnst) {
3616         // Shift amount must be constant
3617         return false;
3618       }
3619       uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
3620 
3621       // To avoid extra codegen and be profitable, we need Outer >= Inner
3622       if (OuterShiftAmt < InnerShiftAmt) {
3623         return false;
3624       }
3625 
3626       // If the outer shift is more than the type size, we have no bitfield to
3627       // extract (since we also check that the inner shift is <= the outer shift
3628       // then this also implies that the inner shift is < the type size)
3629       if (OuterShiftAmt >= Val.getValueSizeInBits()) {
3630         return false;
3631       }
3632 
3633       Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
3634                                         MVT::i32);
3635       Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
3636                                       DL, MVT::i32);
3637 
3638       if (N->getOpcode() == ISD::SRA) {
3639         // If we have a arithmetic right shift, we need to use the signed bfe
3640         // variant
3641         IsSigned = true;
3642       }
3643     } else {
3644       // No can do...
3645       return false;
3646     }
3647   } else {
3648     // No can do...
3649     return false;
3650   }
3651 
3652 
3653   unsigned Opc;
3654   // For the BFE operations we form here from "and" and "srl", always use the
3655   // unsigned variants.
3656   if (Val.getValueType() == MVT::i32) {
3657     if (IsSigned) {
3658       Opc = NVPTX::BFE_S32rii;
3659     } else {
3660       Opc = NVPTX::BFE_U32rii;
3661     }
3662   } else if (Val.getValueType() == MVT::i64) {
3663     if (IsSigned) {
3664       Opc = NVPTX::BFE_S64rii;
3665     } else {
3666       Opc = NVPTX::BFE_U64rii;
3667     }
3668   } else {
3669     // We cannot handle this type
3670     return false;
3671   }
3672 
3673   SDValue Ops[] = {
3674     Val, Start, Len
3675   };
3676 
3677   ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
3678   return true;
3679 }
3680 
3681 // SelectDirectAddr - Match a direct address for DAG.
3682 // A direct address could be a globaladdress or externalsymbol.
3683 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
3684   // Return true if TGA or ES.
3685   if (N.getOpcode() == ISD::TargetGlobalAddress ||
3686       N.getOpcode() == ISD::TargetExternalSymbol) {
3687     Address = N;
3688     return true;
3689   }
3690   if (N.getOpcode() == NVPTXISD::Wrapper) {
3691     Address = N.getOperand(0);
3692     return true;
3693   }
3694   // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3695   if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
3696     if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
3697         CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
3698         CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
3699       return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
3700   }
3701   return false;
3702 }
3703 
3704 // symbol+offset
3705 bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3706     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3707   if (Addr.getOpcode() == ISD::ADD) {
3708     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3709       SDValue base = Addr.getOperand(0);
3710       if (SelectDirectAddr(base, Base)) {
3711         Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3712                                            mvt);
3713         return true;
3714       }
3715     }
3716   }
3717   return false;
3718 }
3719 
3720 // symbol+offset
3721 bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
3722                                      SDValue &Base, SDValue &Offset) {
3723   return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
3724 }
3725 
3726 // symbol+offset
3727 bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
3728                                        SDValue &Base, SDValue &Offset) {
3729   return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
3730 }
3731 
3732 // register+offset
3733 bool NVPTXDAGToDAGISel::SelectADDRri_imp(
3734     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3735   if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
3736     Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3737     Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
3738     return true;
3739   }
3740   if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
3741       Addr.getOpcode() == ISD::TargetGlobalAddress)
3742     return false; // direct calls.
3743 
3744   if (Addr.getOpcode() == ISD::ADD) {
3745     if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
3746       return false;
3747     }
3748     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3749       if (FrameIndexSDNode *FIN =
3750               dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
3751         // Constant offset from frame ref.
3752         Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3753       else
3754         Base = Addr.getOperand(0);
3755 
3756       // Offset must fit in a 32-bit signed int in PTX [register+offset] address
3757       // mode
3758       if (!CN->getAPIntValue().isSignedIntN(32))
3759         return false;
3760 
3761       Offset = CurDAG->getTargetConstant(CN->getSExtValue(), SDLoc(OpNode),
3762                                          MVT::i32);
3763       return true;
3764     }
3765   }
3766   return false;
3767 }
3768 
3769 // register+offset
3770 bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
3771                                      SDValue &Base, SDValue &Offset) {
3772   return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
3773 }
3774 
3775 // register+offset
3776 bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
3777                                        SDValue &Base, SDValue &Offset) {
3778   return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
3779 }
3780 
3781 bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
3782                                                  unsigned int spN) const {
3783   const Value *Src = nullptr;
3784   if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
3785     if (spN == 0 && mN->getMemOperand()->getPseudoValue())
3786       return true;
3787     Src = mN->getMemOperand()->getValue();
3788   }
3789   if (!Src)
3790     return false;
3791   if (auto *PT = dyn_cast<PointerType>(Src->getType()))
3792     return (PT->getAddressSpace() == spN);
3793   return false;
3794 }
3795 
3796 /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3797 /// inline asm expressions.
3798 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
3799     const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
3800     std::vector<SDValue> &OutOps) {
3801   SDValue Op0, Op1;
3802   switch (ConstraintID) {
3803   default:
3804     return true;
3805   case InlineAsm::ConstraintCode::m: // memory
3806     if (SelectDirectAddr(Op, Op0)) {
3807       OutOps.push_back(Op0);
3808       OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
3809       return false;
3810     }
3811     if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
3812       OutOps.push_back(Op0);
3813       OutOps.push_back(Op1);
3814       return false;
3815     }
3816     break;
3817   }
3818   return true;
3819 }
3820 
3821 void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
3822   // Lower a CopyToReg with two 64-bit inputs
3823   // Dst:i128, lo:i64, hi:i64
3824   //
3825   // CopyToReg Dst, lo, hi;
3826   //
3827   // ==>
3828   //
3829   // tmp = V2I64toI128 {lo, hi};
3830   // CopyToReg Dst, tmp;
3831   SDValue Dst = N->getOperand(1);
3832   SDValue Lo = N->getOperand(2);
3833   SDValue Hi = N->getOperand(3);
3834 
3835   SDLoc DL(N);
3836   SDNode *Mov =
3837       CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi});
3838 
3839   SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);
3840   NewOps[0] = N->getOperand(0);
3841   NewOps[1] = Dst;
3842   NewOps[2] = SDValue(Mov, 0);
3843   if (N->getNumOperands() == 5)
3844     NewOps[3] = N->getOperand(4);
3845   SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, SmallVector<EVT>(N->values()), NewOps);
3846 
3847   ReplaceNode(N, NewValue.getNode());
3848 }
3849 
3850 void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
3851   // Lower CopyFromReg from a 128-bit regs to two 64-bit regs
3852   // Dst:i128, Src:i128
3853   //
3854   // {lo, hi} = CopyFromReg Src
3855   //
3856   // ==>
3857   //
3858   // {lo, hi} = I128toV2I64 Src
3859   //
3860   SDValue Ch = N->getOperand(0);
3861   SDValue Src = N->getOperand(1);
3862   SDValue Glue = N->getOperand(2);
3863   SDLoc DL(N);
3864 
3865   // Add Glue and Ch to the operands and results to avoid break the execution
3866   // order
3867   SDNode *Mov = CurDAG->getMachineNode(
3868       NVPTX::I128toV2I64, DL,
3869       {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},
3870       {Src, Ch, Glue});
3871 
3872   ReplaceNode(N, Mov);
3873 }
3874 
3875 /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
3876 /// conversion from \p SrcTy to \p DestTy.
3877 unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
3878                                              LoadSDNode *LdNode) {
3879   bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
3880   switch (SrcTy.SimpleTy) {
3881   default:
3882     llvm_unreachable("Unhandled source type");
3883   case MVT::i8:
3884     switch (DestTy.SimpleTy) {
3885     default:
3886       llvm_unreachable("Unhandled dest type");
3887     case MVT::i16:
3888       return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
3889     case MVT::i32:
3890       return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
3891     case MVT::i64:
3892       return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
3893     }
3894   case MVT::i16:
3895     switch (DestTy.SimpleTy) {
3896     default:
3897       llvm_unreachable("Unhandled dest type");
3898     case MVT::i8:
3899       return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
3900     case MVT::i32:
3901       return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
3902     case MVT::i64:
3903       return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
3904     }
3905   case MVT::i32:
3906     switch (DestTy.SimpleTy) {
3907     default:
3908       llvm_unreachable("Unhandled dest type");
3909     case MVT::i8:
3910       return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
3911     case MVT::i16:
3912       return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
3913     case MVT::i64:
3914       return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
3915     }
3916   case MVT::i64:
3917     switch (DestTy.SimpleTy) {
3918     default:
3919       llvm_unreachable("Unhandled dest type");
3920     case MVT::i8:
3921       return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
3922     case MVT::i16:
3923       return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
3924     case MVT::i32:
3925       return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
3926     }
3927   case MVT::f16:
3928     switch (DestTy.SimpleTy) {
3929     default:
3930       llvm_unreachable("Unhandled dest type");
3931     case MVT::f32:
3932       return NVPTX::CVT_f32_f16;
3933     case MVT::f64:
3934       return NVPTX::CVT_f64_f16;
3935     }
3936   }
3937 }
3938