xref: /freebsd/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp (revision 9e5787d2284e187abb5b654d924394a65772e004)
1 //===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines an instruction selector for the NVPTX target.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "NVPTXISelDAGToDAG.h"
14 #include "MCTargetDesc/NVPTXBaseInfo.h"
15 #include "NVPTXUtilities.h"
16 #include "llvm/Analysis/ValueTracking.h"
17 #include "llvm/IR/GlobalValue.h"
18 #include "llvm/IR/Instructions.h"
19 #include "llvm/IR/IntrinsicsNVPTX.h"
20 #include "llvm/Support/AtomicOrdering.h"
21 #include "llvm/Support/CommandLine.h"
22 #include "llvm/Support/Debug.h"
23 #include "llvm/Support/ErrorHandling.h"
24 #include "llvm/Support/raw_ostream.h"
25 #include "llvm/Target/TargetIntrinsicInfo.h"
26 
27 using namespace llvm;
28 
29 #define DEBUG_TYPE "nvptx-isel"
30 
31 /// createNVPTXISelDag - This pass converts a legalized DAG into a
32 /// NVPTX-specific DAG, ready for instruction scheduling.
33 FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
34                                        llvm::CodeGenOpt::Level OptLevel) {
35   return new NVPTXDAGToDAGISel(TM, OptLevel);
36 }
37 
38 NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
39                                      CodeGenOpt::Level OptLevel)
40     : SelectionDAGISel(tm, OptLevel), TM(tm) {
41   doMulWide = (OptLevel > 0);
42 }
43 
44 bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
45   Subtarget = &static_cast<const NVPTXSubtarget &>(MF.getSubtarget());
46   return SelectionDAGISel::runOnMachineFunction(MF);
47 }
48 
49 int NVPTXDAGToDAGISel::getDivF32Level() const {
50   return Subtarget->getTargetLowering()->getDivF32Level();
51 }
52 
53 bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
54   return Subtarget->getTargetLowering()->usePrecSqrtF32();
55 }
56 
57 bool NVPTXDAGToDAGISel::useF32FTZ() const {
58   return Subtarget->getTargetLowering()->useF32FTZ(*MF);
59 }
60 
61 bool NVPTXDAGToDAGISel::allowFMA() const {
62   const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
63   return TL->allowFMA(*MF, OptLevel);
64 }
65 
66 bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
67   const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
68   return TL->allowUnsafeFPMath(*MF);
69 }
70 
71 bool NVPTXDAGToDAGISel::useShortPointers() const {
72   return TM.useShortPointers();
73 }
74 
75 /// Select - Select instructions not customized! Used for
76 /// expanded, promoted and normal instructions.
77 void NVPTXDAGToDAGISel::Select(SDNode *N) {
78 
79   if (N->isMachineOpcode()) {
80     N->setNodeId(-1);
81     return; // Already selected.
82   }
83 
84   switch (N->getOpcode()) {
85   case ISD::LOAD:
86   case ISD::ATOMIC_LOAD:
87     if (tryLoad(N))
88       return;
89     break;
90   case ISD::STORE:
91   case ISD::ATOMIC_STORE:
92     if (tryStore(N))
93       return;
94     break;
95   case ISD::EXTRACT_VECTOR_ELT:
96     if (tryEXTRACT_VECTOR_ELEMENT(N))
97       return;
98     break;
99   case NVPTXISD::SETP_F16X2:
100     SelectSETP_F16X2(N);
101     return;
102 
103   case NVPTXISD::LoadV2:
104   case NVPTXISD::LoadV4:
105     if (tryLoadVector(N))
106       return;
107     break;
108   case NVPTXISD::LDGV2:
109   case NVPTXISD::LDGV4:
110   case NVPTXISD::LDUV2:
111   case NVPTXISD::LDUV4:
112     if (tryLDGLDU(N))
113       return;
114     break;
115   case NVPTXISD::StoreV2:
116   case NVPTXISD::StoreV4:
117     if (tryStoreVector(N))
118       return;
119     break;
120   case NVPTXISD::LoadParam:
121   case NVPTXISD::LoadParamV2:
122   case NVPTXISD::LoadParamV4:
123     if (tryLoadParam(N))
124       return;
125     break;
126   case NVPTXISD::StoreRetval:
127   case NVPTXISD::StoreRetvalV2:
128   case NVPTXISD::StoreRetvalV4:
129     if (tryStoreRetval(N))
130       return;
131     break;
132   case NVPTXISD::StoreParam:
133   case NVPTXISD::StoreParamV2:
134   case NVPTXISD::StoreParamV4:
135   case NVPTXISD::StoreParamS32:
136   case NVPTXISD::StoreParamU32:
137     if (tryStoreParam(N))
138       return;
139     break;
140   case ISD::INTRINSIC_WO_CHAIN:
141     if (tryIntrinsicNoChain(N))
142       return;
143     break;
144   case ISD::INTRINSIC_W_CHAIN:
145     if (tryIntrinsicChain(N))
146       return;
147     break;
148   case NVPTXISD::Tex1DFloatS32:
149   case NVPTXISD::Tex1DFloatFloat:
150   case NVPTXISD::Tex1DFloatFloatLevel:
151   case NVPTXISD::Tex1DFloatFloatGrad:
152   case NVPTXISD::Tex1DS32S32:
153   case NVPTXISD::Tex1DS32Float:
154   case NVPTXISD::Tex1DS32FloatLevel:
155   case NVPTXISD::Tex1DS32FloatGrad:
156   case NVPTXISD::Tex1DU32S32:
157   case NVPTXISD::Tex1DU32Float:
158   case NVPTXISD::Tex1DU32FloatLevel:
159   case NVPTXISD::Tex1DU32FloatGrad:
160   case NVPTXISD::Tex1DArrayFloatS32:
161   case NVPTXISD::Tex1DArrayFloatFloat:
162   case NVPTXISD::Tex1DArrayFloatFloatLevel:
163   case NVPTXISD::Tex1DArrayFloatFloatGrad:
164   case NVPTXISD::Tex1DArrayS32S32:
165   case NVPTXISD::Tex1DArrayS32Float:
166   case NVPTXISD::Tex1DArrayS32FloatLevel:
167   case NVPTXISD::Tex1DArrayS32FloatGrad:
168   case NVPTXISD::Tex1DArrayU32S32:
169   case NVPTXISD::Tex1DArrayU32Float:
170   case NVPTXISD::Tex1DArrayU32FloatLevel:
171   case NVPTXISD::Tex1DArrayU32FloatGrad:
172   case NVPTXISD::Tex2DFloatS32:
173   case NVPTXISD::Tex2DFloatFloat:
174   case NVPTXISD::Tex2DFloatFloatLevel:
175   case NVPTXISD::Tex2DFloatFloatGrad:
176   case NVPTXISD::Tex2DS32S32:
177   case NVPTXISD::Tex2DS32Float:
178   case NVPTXISD::Tex2DS32FloatLevel:
179   case NVPTXISD::Tex2DS32FloatGrad:
180   case NVPTXISD::Tex2DU32S32:
181   case NVPTXISD::Tex2DU32Float:
182   case NVPTXISD::Tex2DU32FloatLevel:
183   case NVPTXISD::Tex2DU32FloatGrad:
184   case NVPTXISD::Tex2DArrayFloatS32:
185   case NVPTXISD::Tex2DArrayFloatFloat:
186   case NVPTXISD::Tex2DArrayFloatFloatLevel:
187   case NVPTXISD::Tex2DArrayFloatFloatGrad:
188   case NVPTXISD::Tex2DArrayS32S32:
189   case NVPTXISD::Tex2DArrayS32Float:
190   case NVPTXISD::Tex2DArrayS32FloatLevel:
191   case NVPTXISD::Tex2DArrayS32FloatGrad:
192   case NVPTXISD::Tex2DArrayU32S32:
193   case NVPTXISD::Tex2DArrayU32Float:
194   case NVPTXISD::Tex2DArrayU32FloatLevel:
195   case NVPTXISD::Tex2DArrayU32FloatGrad:
196   case NVPTXISD::Tex3DFloatS32:
197   case NVPTXISD::Tex3DFloatFloat:
198   case NVPTXISD::Tex3DFloatFloatLevel:
199   case NVPTXISD::Tex3DFloatFloatGrad:
200   case NVPTXISD::Tex3DS32S32:
201   case NVPTXISD::Tex3DS32Float:
202   case NVPTXISD::Tex3DS32FloatLevel:
203   case NVPTXISD::Tex3DS32FloatGrad:
204   case NVPTXISD::Tex3DU32S32:
205   case NVPTXISD::Tex3DU32Float:
206   case NVPTXISD::Tex3DU32FloatLevel:
207   case NVPTXISD::Tex3DU32FloatGrad:
208   case NVPTXISD::TexCubeFloatFloat:
209   case NVPTXISD::TexCubeFloatFloatLevel:
210   case NVPTXISD::TexCubeS32Float:
211   case NVPTXISD::TexCubeS32FloatLevel:
212   case NVPTXISD::TexCubeU32Float:
213   case NVPTXISD::TexCubeU32FloatLevel:
214   case NVPTXISD::TexCubeArrayFloatFloat:
215   case NVPTXISD::TexCubeArrayFloatFloatLevel:
216   case NVPTXISD::TexCubeArrayS32Float:
217   case NVPTXISD::TexCubeArrayS32FloatLevel:
218   case NVPTXISD::TexCubeArrayU32Float:
219   case NVPTXISD::TexCubeArrayU32FloatLevel:
220   case NVPTXISD::Tld4R2DFloatFloat:
221   case NVPTXISD::Tld4G2DFloatFloat:
222   case NVPTXISD::Tld4B2DFloatFloat:
223   case NVPTXISD::Tld4A2DFloatFloat:
224   case NVPTXISD::Tld4R2DS64Float:
225   case NVPTXISD::Tld4G2DS64Float:
226   case NVPTXISD::Tld4B2DS64Float:
227   case NVPTXISD::Tld4A2DS64Float:
228   case NVPTXISD::Tld4R2DU64Float:
229   case NVPTXISD::Tld4G2DU64Float:
230   case NVPTXISD::Tld4B2DU64Float:
231   case NVPTXISD::Tld4A2DU64Float:
232   case NVPTXISD::TexUnified1DFloatS32:
233   case NVPTXISD::TexUnified1DFloatFloat:
234   case NVPTXISD::TexUnified1DFloatFloatLevel:
235   case NVPTXISD::TexUnified1DFloatFloatGrad:
236   case NVPTXISD::TexUnified1DS32S32:
237   case NVPTXISD::TexUnified1DS32Float:
238   case NVPTXISD::TexUnified1DS32FloatLevel:
239   case NVPTXISD::TexUnified1DS32FloatGrad:
240   case NVPTXISD::TexUnified1DU32S32:
241   case NVPTXISD::TexUnified1DU32Float:
242   case NVPTXISD::TexUnified1DU32FloatLevel:
243   case NVPTXISD::TexUnified1DU32FloatGrad:
244   case NVPTXISD::TexUnified1DArrayFloatS32:
245   case NVPTXISD::TexUnified1DArrayFloatFloat:
246   case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
247   case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
248   case NVPTXISD::TexUnified1DArrayS32S32:
249   case NVPTXISD::TexUnified1DArrayS32Float:
250   case NVPTXISD::TexUnified1DArrayS32FloatLevel:
251   case NVPTXISD::TexUnified1DArrayS32FloatGrad:
252   case NVPTXISD::TexUnified1DArrayU32S32:
253   case NVPTXISD::TexUnified1DArrayU32Float:
254   case NVPTXISD::TexUnified1DArrayU32FloatLevel:
255   case NVPTXISD::TexUnified1DArrayU32FloatGrad:
256   case NVPTXISD::TexUnified2DFloatS32:
257   case NVPTXISD::TexUnified2DFloatFloat:
258   case NVPTXISD::TexUnified2DFloatFloatLevel:
259   case NVPTXISD::TexUnified2DFloatFloatGrad:
260   case NVPTXISD::TexUnified2DS32S32:
261   case NVPTXISD::TexUnified2DS32Float:
262   case NVPTXISD::TexUnified2DS32FloatLevel:
263   case NVPTXISD::TexUnified2DS32FloatGrad:
264   case NVPTXISD::TexUnified2DU32S32:
265   case NVPTXISD::TexUnified2DU32Float:
266   case NVPTXISD::TexUnified2DU32FloatLevel:
267   case NVPTXISD::TexUnified2DU32FloatGrad:
268   case NVPTXISD::TexUnified2DArrayFloatS32:
269   case NVPTXISD::TexUnified2DArrayFloatFloat:
270   case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
271   case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
272   case NVPTXISD::TexUnified2DArrayS32S32:
273   case NVPTXISD::TexUnified2DArrayS32Float:
274   case NVPTXISD::TexUnified2DArrayS32FloatLevel:
275   case NVPTXISD::TexUnified2DArrayS32FloatGrad:
276   case NVPTXISD::TexUnified2DArrayU32S32:
277   case NVPTXISD::TexUnified2DArrayU32Float:
278   case NVPTXISD::TexUnified2DArrayU32FloatLevel:
279   case NVPTXISD::TexUnified2DArrayU32FloatGrad:
280   case NVPTXISD::TexUnified3DFloatS32:
281   case NVPTXISD::TexUnified3DFloatFloat:
282   case NVPTXISD::TexUnified3DFloatFloatLevel:
283   case NVPTXISD::TexUnified3DFloatFloatGrad:
284   case NVPTXISD::TexUnified3DS32S32:
285   case NVPTXISD::TexUnified3DS32Float:
286   case NVPTXISD::TexUnified3DS32FloatLevel:
287   case NVPTXISD::TexUnified3DS32FloatGrad:
288   case NVPTXISD::TexUnified3DU32S32:
289   case NVPTXISD::TexUnified3DU32Float:
290   case NVPTXISD::TexUnified3DU32FloatLevel:
291   case NVPTXISD::TexUnified3DU32FloatGrad:
292   case NVPTXISD::TexUnifiedCubeFloatFloat:
293   case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
294   case NVPTXISD::TexUnifiedCubeS32Float:
295   case NVPTXISD::TexUnifiedCubeS32FloatLevel:
296   case NVPTXISD::TexUnifiedCubeU32Float:
297   case NVPTXISD::TexUnifiedCubeU32FloatLevel:
298   case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
299   case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
300   case NVPTXISD::TexUnifiedCubeArrayS32Float:
301   case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
302   case NVPTXISD::TexUnifiedCubeArrayU32Float:
303   case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
304   case NVPTXISD::Tld4UnifiedR2DFloatFloat:
305   case NVPTXISD::Tld4UnifiedG2DFloatFloat:
306   case NVPTXISD::Tld4UnifiedB2DFloatFloat:
307   case NVPTXISD::Tld4UnifiedA2DFloatFloat:
308   case NVPTXISD::Tld4UnifiedR2DS64Float:
309   case NVPTXISD::Tld4UnifiedG2DS64Float:
310   case NVPTXISD::Tld4UnifiedB2DS64Float:
311   case NVPTXISD::Tld4UnifiedA2DS64Float:
312   case NVPTXISD::Tld4UnifiedR2DU64Float:
313   case NVPTXISD::Tld4UnifiedG2DU64Float:
314   case NVPTXISD::Tld4UnifiedB2DU64Float:
315   case NVPTXISD::Tld4UnifiedA2DU64Float:
316     if (tryTextureIntrinsic(N))
317       return;
318     break;
319   case NVPTXISD::Suld1DI8Clamp:
320   case NVPTXISD::Suld1DI16Clamp:
321   case NVPTXISD::Suld1DI32Clamp:
322   case NVPTXISD::Suld1DI64Clamp:
323   case NVPTXISD::Suld1DV2I8Clamp:
324   case NVPTXISD::Suld1DV2I16Clamp:
325   case NVPTXISD::Suld1DV2I32Clamp:
326   case NVPTXISD::Suld1DV2I64Clamp:
327   case NVPTXISD::Suld1DV4I8Clamp:
328   case NVPTXISD::Suld1DV4I16Clamp:
329   case NVPTXISD::Suld1DV4I32Clamp:
330   case NVPTXISD::Suld1DArrayI8Clamp:
331   case NVPTXISD::Suld1DArrayI16Clamp:
332   case NVPTXISD::Suld1DArrayI32Clamp:
333   case NVPTXISD::Suld1DArrayI64Clamp:
334   case NVPTXISD::Suld1DArrayV2I8Clamp:
335   case NVPTXISD::Suld1DArrayV2I16Clamp:
336   case NVPTXISD::Suld1DArrayV2I32Clamp:
337   case NVPTXISD::Suld1DArrayV2I64Clamp:
338   case NVPTXISD::Suld1DArrayV4I8Clamp:
339   case NVPTXISD::Suld1DArrayV4I16Clamp:
340   case NVPTXISD::Suld1DArrayV4I32Clamp:
341   case NVPTXISD::Suld2DI8Clamp:
342   case NVPTXISD::Suld2DI16Clamp:
343   case NVPTXISD::Suld2DI32Clamp:
344   case NVPTXISD::Suld2DI64Clamp:
345   case NVPTXISD::Suld2DV2I8Clamp:
346   case NVPTXISD::Suld2DV2I16Clamp:
347   case NVPTXISD::Suld2DV2I32Clamp:
348   case NVPTXISD::Suld2DV2I64Clamp:
349   case NVPTXISD::Suld2DV4I8Clamp:
350   case NVPTXISD::Suld2DV4I16Clamp:
351   case NVPTXISD::Suld2DV4I32Clamp:
352   case NVPTXISD::Suld2DArrayI8Clamp:
353   case NVPTXISD::Suld2DArrayI16Clamp:
354   case NVPTXISD::Suld2DArrayI32Clamp:
355   case NVPTXISD::Suld2DArrayI64Clamp:
356   case NVPTXISD::Suld2DArrayV2I8Clamp:
357   case NVPTXISD::Suld2DArrayV2I16Clamp:
358   case NVPTXISD::Suld2DArrayV2I32Clamp:
359   case NVPTXISD::Suld2DArrayV2I64Clamp:
360   case NVPTXISD::Suld2DArrayV4I8Clamp:
361   case NVPTXISD::Suld2DArrayV4I16Clamp:
362   case NVPTXISD::Suld2DArrayV4I32Clamp:
363   case NVPTXISD::Suld3DI8Clamp:
364   case NVPTXISD::Suld3DI16Clamp:
365   case NVPTXISD::Suld3DI32Clamp:
366   case NVPTXISD::Suld3DI64Clamp:
367   case NVPTXISD::Suld3DV2I8Clamp:
368   case NVPTXISD::Suld3DV2I16Clamp:
369   case NVPTXISD::Suld3DV2I32Clamp:
370   case NVPTXISD::Suld3DV2I64Clamp:
371   case NVPTXISD::Suld3DV4I8Clamp:
372   case NVPTXISD::Suld3DV4I16Clamp:
373   case NVPTXISD::Suld3DV4I32Clamp:
374   case NVPTXISD::Suld1DI8Trap:
375   case NVPTXISD::Suld1DI16Trap:
376   case NVPTXISD::Suld1DI32Trap:
377   case NVPTXISD::Suld1DI64Trap:
378   case NVPTXISD::Suld1DV2I8Trap:
379   case NVPTXISD::Suld1DV2I16Trap:
380   case NVPTXISD::Suld1DV2I32Trap:
381   case NVPTXISD::Suld1DV2I64Trap:
382   case NVPTXISD::Suld1DV4I8Trap:
383   case NVPTXISD::Suld1DV4I16Trap:
384   case NVPTXISD::Suld1DV4I32Trap:
385   case NVPTXISD::Suld1DArrayI8Trap:
386   case NVPTXISD::Suld1DArrayI16Trap:
387   case NVPTXISD::Suld1DArrayI32Trap:
388   case NVPTXISD::Suld1DArrayI64Trap:
389   case NVPTXISD::Suld1DArrayV2I8Trap:
390   case NVPTXISD::Suld1DArrayV2I16Trap:
391   case NVPTXISD::Suld1DArrayV2I32Trap:
392   case NVPTXISD::Suld1DArrayV2I64Trap:
393   case NVPTXISD::Suld1DArrayV4I8Trap:
394   case NVPTXISD::Suld1DArrayV4I16Trap:
395   case NVPTXISD::Suld1DArrayV4I32Trap:
396   case NVPTXISD::Suld2DI8Trap:
397   case NVPTXISD::Suld2DI16Trap:
398   case NVPTXISD::Suld2DI32Trap:
399   case NVPTXISD::Suld2DI64Trap:
400   case NVPTXISD::Suld2DV2I8Trap:
401   case NVPTXISD::Suld2DV2I16Trap:
402   case NVPTXISD::Suld2DV2I32Trap:
403   case NVPTXISD::Suld2DV2I64Trap:
404   case NVPTXISD::Suld2DV4I8Trap:
405   case NVPTXISD::Suld2DV4I16Trap:
406   case NVPTXISD::Suld2DV4I32Trap:
407   case NVPTXISD::Suld2DArrayI8Trap:
408   case NVPTXISD::Suld2DArrayI16Trap:
409   case NVPTXISD::Suld2DArrayI32Trap:
410   case NVPTXISD::Suld2DArrayI64Trap:
411   case NVPTXISD::Suld2DArrayV2I8Trap:
412   case NVPTXISD::Suld2DArrayV2I16Trap:
413   case NVPTXISD::Suld2DArrayV2I32Trap:
414   case NVPTXISD::Suld2DArrayV2I64Trap:
415   case NVPTXISD::Suld2DArrayV4I8Trap:
416   case NVPTXISD::Suld2DArrayV4I16Trap:
417   case NVPTXISD::Suld2DArrayV4I32Trap:
418   case NVPTXISD::Suld3DI8Trap:
419   case NVPTXISD::Suld3DI16Trap:
420   case NVPTXISD::Suld3DI32Trap:
421   case NVPTXISD::Suld3DI64Trap:
422   case NVPTXISD::Suld3DV2I8Trap:
423   case NVPTXISD::Suld3DV2I16Trap:
424   case NVPTXISD::Suld3DV2I32Trap:
425   case NVPTXISD::Suld3DV2I64Trap:
426   case NVPTXISD::Suld3DV4I8Trap:
427   case NVPTXISD::Suld3DV4I16Trap:
428   case NVPTXISD::Suld3DV4I32Trap:
429   case NVPTXISD::Suld1DI8Zero:
430   case NVPTXISD::Suld1DI16Zero:
431   case NVPTXISD::Suld1DI32Zero:
432   case NVPTXISD::Suld1DI64Zero:
433   case NVPTXISD::Suld1DV2I8Zero:
434   case NVPTXISD::Suld1DV2I16Zero:
435   case NVPTXISD::Suld1DV2I32Zero:
436   case NVPTXISD::Suld1DV2I64Zero:
437   case NVPTXISD::Suld1DV4I8Zero:
438   case NVPTXISD::Suld1DV4I16Zero:
439   case NVPTXISD::Suld1DV4I32Zero:
440   case NVPTXISD::Suld1DArrayI8Zero:
441   case NVPTXISD::Suld1DArrayI16Zero:
442   case NVPTXISD::Suld1DArrayI32Zero:
443   case NVPTXISD::Suld1DArrayI64Zero:
444   case NVPTXISD::Suld1DArrayV2I8Zero:
445   case NVPTXISD::Suld1DArrayV2I16Zero:
446   case NVPTXISD::Suld1DArrayV2I32Zero:
447   case NVPTXISD::Suld1DArrayV2I64Zero:
448   case NVPTXISD::Suld1DArrayV4I8Zero:
449   case NVPTXISD::Suld1DArrayV4I16Zero:
450   case NVPTXISD::Suld1DArrayV4I32Zero:
451   case NVPTXISD::Suld2DI8Zero:
452   case NVPTXISD::Suld2DI16Zero:
453   case NVPTXISD::Suld2DI32Zero:
454   case NVPTXISD::Suld2DI64Zero:
455   case NVPTXISD::Suld2DV2I8Zero:
456   case NVPTXISD::Suld2DV2I16Zero:
457   case NVPTXISD::Suld2DV2I32Zero:
458   case NVPTXISD::Suld2DV2I64Zero:
459   case NVPTXISD::Suld2DV4I8Zero:
460   case NVPTXISD::Suld2DV4I16Zero:
461   case NVPTXISD::Suld2DV4I32Zero:
462   case NVPTXISD::Suld2DArrayI8Zero:
463   case NVPTXISD::Suld2DArrayI16Zero:
464   case NVPTXISD::Suld2DArrayI32Zero:
465   case NVPTXISD::Suld2DArrayI64Zero:
466   case NVPTXISD::Suld2DArrayV2I8Zero:
467   case NVPTXISD::Suld2DArrayV2I16Zero:
468   case NVPTXISD::Suld2DArrayV2I32Zero:
469   case NVPTXISD::Suld2DArrayV2I64Zero:
470   case NVPTXISD::Suld2DArrayV4I8Zero:
471   case NVPTXISD::Suld2DArrayV4I16Zero:
472   case NVPTXISD::Suld2DArrayV4I32Zero:
473   case NVPTXISD::Suld3DI8Zero:
474   case NVPTXISD::Suld3DI16Zero:
475   case NVPTXISD::Suld3DI32Zero:
476   case NVPTXISD::Suld3DI64Zero:
477   case NVPTXISD::Suld3DV2I8Zero:
478   case NVPTXISD::Suld3DV2I16Zero:
479   case NVPTXISD::Suld3DV2I32Zero:
480   case NVPTXISD::Suld3DV2I64Zero:
481   case NVPTXISD::Suld3DV4I8Zero:
482   case NVPTXISD::Suld3DV4I16Zero:
483   case NVPTXISD::Suld3DV4I32Zero:
484     if (trySurfaceIntrinsic(N))
485       return;
486     break;
487   case ISD::AND:
488   case ISD::SRA:
489   case ISD::SRL:
490     // Try to select BFE
491     if (tryBFE(N))
492       return;
493     break;
494   case ISD::ADDRSPACECAST:
495     SelectAddrSpaceCast(N);
496     return;
497   case ISD::ConstantFP:
498     if (tryConstantFP16(N))
499       return;
500     break;
501   default:
502     break;
503   }
504   SelectCode(N);
505 }
506 
507 bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
508   unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
509   switch (IID) {
510   default:
511     return false;
512   case Intrinsic::nvvm_ldg_global_f:
513   case Intrinsic::nvvm_ldg_global_i:
514   case Intrinsic::nvvm_ldg_global_p:
515   case Intrinsic::nvvm_ldu_global_f:
516   case Intrinsic::nvvm_ldu_global_i:
517   case Intrinsic::nvvm_ldu_global_p:
518     return tryLDGLDU(N);
519   }
520 }
521 
522 // There's no way to specify FP16 immediates in .f16 ops, so we have to
523 // load them into an .f16 register first.
524 bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode *N) {
525   if (N->getValueType(0) != MVT::f16)
526     return false;
527   SDValue Val = CurDAG->getTargetConstantFP(
528       cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), MVT::f16);
529   SDNode *LoadConstF16 =
530       CurDAG->getMachineNode(NVPTX::LOAD_CONST_F16, SDLoc(N), MVT::f16, Val);
531   ReplaceNode(N, LoadConstF16);
532   return true;
533 }
534 
535 // Map ISD:CONDCODE value to appropriate CmpMode expected by
536 // NVPTXInstPrinter::printCmpMode()
537 static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
538   using NVPTX::PTXCmpMode::CmpMode;
539   unsigned PTXCmpMode = [](ISD::CondCode CC) {
540     switch (CC) {
541     default:
542       llvm_unreachable("Unexpected condition code.");
543     case ISD::SETOEQ:
544       return CmpMode::EQ;
545     case ISD::SETOGT:
546       return CmpMode::GT;
547     case ISD::SETOGE:
548       return CmpMode::GE;
549     case ISD::SETOLT:
550       return CmpMode::LT;
551     case ISD::SETOLE:
552       return CmpMode::LE;
553     case ISD::SETONE:
554       return CmpMode::NE;
555     case ISD::SETO:
556       return CmpMode::NUM;
557     case ISD::SETUO:
558       return CmpMode::NotANumber;
559     case ISD::SETUEQ:
560       return CmpMode::EQU;
561     case ISD::SETUGT:
562       return CmpMode::GTU;
563     case ISD::SETUGE:
564       return CmpMode::GEU;
565     case ISD::SETULT:
566       return CmpMode::LTU;
567     case ISD::SETULE:
568       return CmpMode::LEU;
569     case ISD::SETUNE:
570       return CmpMode::NEU;
571     case ISD::SETEQ:
572       return CmpMode::EQ;
573     case ISD::SETGT:
574       return CmpMode::GT;
575     case ISD::SETGE:
576       return CmpMode::GE;
577     case ISD::SETLT:
578       return CmpMode::LT;
579     case ISD::SETLE:
580       return CmpMode::LE;
581     case ISD::SETNE:
582       return CmpMode::NE;
583     }
584   }(CondCode.get());
585 
586   if (FTZ)
587     PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
588 
589   return PTXCmpMode;
590 }
591 
592 bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
593   unsigned PTXCmpMode =
594       getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
595   SDLoc DL(N);
596   SDNode *SetP = CurDAG->getMachineNode(
597       NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
598       N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
599   ReplaceNode(N, SetP);
600   return true;
601 }
602 
603 // Find all instances of extract_vector_elt that use this v2f16 vector
604 // and coalesce them into a scattering move instruction.
605 bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
606   SDValue Vector = N->getOperand(0);
607 
608   // We only care about f16x2 as it's the only real vector type we
609   // need to deal with.
610   if (Vector.getSimpleValueType() != MVT::v2f16)
611     return false;
612 
613   // Find and record all uses of this vector that extract element 0 or 1.
614   SmallVector<SDNode *, 4> E0, E1;
615   for (auto U : Vector.getNode()->uses()) {
616     if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
617       continue;
618     if (U->getOperand(0) != Vector)
619       continue;
620     if (const ConstantSDNode *IdxConst =
621             dyn_cast<ConstantSDNode>(U->getOperand(1))) {
622       if (IdxConst->getZExtValue() == 0)
623         E0.push_back(U);
624       else if (IdxConst->getZExtValue() == 1)
625         E1.push_back(U);
626       else
627         llvm_unreachable("Invalid vector index.");
628     }
629   }
630 
631   // There's no point scattering f16x2 if we only ever access one
632   // element of it.
633   if (E0.empty() || E1.empty())
634     return false;
635 
636   unsigned Op = NVPTX::SplitF16x2;
637   // If the vector has been BITCAST'ed from i32, we can use original
638   // value directly and avoid register-to-register move.
639   SDValue Source = Vector;
640   if (Vector->getOpcode() == ISD::BITCAST) {
641     Op = NVPTX::SplitI32toF16x2;
642     Source = Vector->getOperand(0);
643   }
644   // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
645   // into f16,f16 SplitF16x2(V)
646   SDNode *ScatterOp =
647       CurDAG->getMachineNode(Op, SDLoc(N), MVT::f16, MVT::f16, Source);
648   for (auto *Node : E0)
649     ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
650   for (auto *Node : E1)
651     ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
652 
653   return true;
654 }
655 
656 static unsigned int getCodeAddrSpace(MemSDNode *N) {
657   const Value *Src = N->getMemOperand()->getValue();
658 
659   if (!Src)
660     return NVPTX::PTXLdStInstCode::GENERIC;
661 
662   if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
663     switch (PT->getAddressSpace()) {
664     case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
665     case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
666     case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
667     case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
668     case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
669     case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
670     default: break;
671     }
672   }
673   return NVPTX::PTXLdStInstCode::GENERIC;
674 }
675 
676 static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
677                           unsigned CodeAddrSpace, MachineFunction *F) {
678   // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
679   // space.
680   //
681   // We have two ways of identifying invariant loads: Loads may be explicitly
682   // marked as invariant, or we may infer them to be invariant.
683   //
684   // We currently infer invariance for loads from
685   //  - constant global variables, and
686   //  - kernel function pointer params that are noalias (i.e. __restrict) and
687   //    never written to.
688   //
689   // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
690   // not during the SelectionDAG phase).
691   //
692   // TODO: Infer invariance only at -O2.  We still want to use ldg at -O0 for
693   // explicitly invariant loads because these are how clang tells us to use ldg
694   // when the user uses a builtin.
695   if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
696     return false;
697 
698   if (N->isInvariant())
699     return true;
700 
701   bool IsKernelFn = isKernelFunction(F->getFunction());
702 
703   // We use GetUnderlyingObjects() here instead of GetUnderlyingObject() mainly
704   // because the former looks through phi nodes while the latter does not. We
705   // need to look through phi nodes to handle pointer induction variables.
706   SmallVector<const Value *, 8> Objs;
707   GetUnderlyingObjects(N->getMemOperand()->getValue(),
708                        Objs, F->getDataLayout());
709 
710   return all_of(Objs, [&](const Value *V) {
711     if (auto *A = dyn_cast<const Argument>(V))
712       return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
713     if (auto *GV = dyn_cast<const GlobalVariable>(V))
714       return GV->isConstant();
715     return false;
716   });
717 }
718 
719 bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
720   unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
721   switch (IID) {
722   default:
723     return false;
724   case Intrinsic::nvvm_texsurf_handle_internal:
725     SelectTexSurfHandle(N);
726     return true;
727   }
728 }
729 
730 void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
731   // Op 0 is the intrinsic ID
732   SDValue Wrapper = N->getOperand(1);
733   SDValue GlobalVal = Wrapper.getOperand(0);
734   ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
735                                         MVT::i64, GlobalVal));
736 }
737 
738 void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
739   SDValue Src = N->getOperand(0);
740   AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
741   unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
742   unsigned DstAddrSpace = CastN->getDestAddressSpace();
743   assert(SrcAddrSpace != DstAddrSpace &&
744          "addrspacecast must be between different address spaces");
745 
746   if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
747     // Specific to generic
748     unsigned Opc;
749     switch (SrcAddrSpace) {
750     default: report_fatal_error("Bad address space in addrspacecast");
751     case ADDRESS_SPACE_GLOBAL:
752       Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
753       break;
754     case ADDRESS_SPACE_SHARED:
755       Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
756                                                : NVPTX::cvta_shared_yes_64)
757                          : NVPTX::cvta_shared_yes;
758       break;
759     case ADDRESS_SPACE_CONST:
760       Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
761                                                : NVPTX::cvta_const_yes_64)
762                          : NVPTX::cvta_const_yes;
763       break;
764     case ADDRESS_SPACE_LOCAL:
765       Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
766                                                : NVPTX::cvta_local_yes_64)
767                          : NVPTX::cvta_local_yes;
768       break;
769     }
770     ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
771                                           Src));
772     return;
773   } else {
774     // Generic to specific
775     if (SrcAddrSpace != 0)
776       report_fatal_error("Cannot cast between two non-generic address spaces");
777     unsigned Opc;
778     switch (DstAddrSpace) {
779     default: report_fatal_error("Bad address space in addrspacecast");
780     case ADDRESS_SPACE_GLOBAL:
781       Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
782                          : NVPTX::cvta_to_global_yes;
783       break;
784     case ADDRESS_SPACE_SHARED:
785       Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
786                                                 : NVPTX::cvta_to_shared_yes_64)
787                          : NVPTX::cvta_to_shared_yes;
788       break;
789     case ADDRESS_SPACE_CONST:
790       Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
791                                              : NVPTX::cvta_to_const_yes_64)
792                          : NVPTX::cvta_to_const_yes;
793       break;
794     case ADDRESS_SPACE_LOCAL:
795       Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
796                                                : NVPTX::cvta_to_local_yes_64)
797                          : NVPTX::cvta_to_local_yes;
798       break;
799     case ADDRESS_SPACE_PARAM:
800       Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
801                          : NVPTX::nvvm_ptr_gen_to_param;
802       break;
803     }
804     ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
805                                           Src));
806     return;
807   }
808 }
809 
810 // Helper function template to reduce amount of boilerplate code for
811 // opcode selection.
812 static Optional<unsigned> pickOpcodeForVT(
813     MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16,
814     unsigned Opcode_i32, Optional<unsigned> Opcode_i64, unsigned Opcode_f16,
815     unsigned Opcode_f16x2, unsigned Opcode_f32, Optional<unsigned> Opcode_f64) {
816   switch (VT) {
817   case MVT::i1:
818   case MVT::i8:
819     return Opcode_i8;
820   case MVT::i16:
821     return Opcode_i16;
822   case MVT::i32:
823     return Opcode_i32;
824   case MVT::i64:
825     return Opcode_i64;
826   case MVT::f16:
827     return Opcode_f16;
828   case MVT::v2f16:
829     return Opcode_f16x2;
830   case MVT::f32:
831     return Opcode_f32;
832   case MVT::f64:
833     return Opcode_f64;
834   default:
835     return None;
836   }
837 }
838 
839 bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
840   SDLoc dl(N);
841   MemSDNode *LD = cast<MemSDNode>(N);
842   assert(LD->readMem() && "Expected load");
843   LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
844   EVT LoadedVT = LD->getMemoryVT();
845   SDNode *NVPTXLD = nullptr;
846 
847   // do not support pre/post inc/dec
848   if (PlainLoad && PlainLoad->isIndexed())
849     return false;
850 
851   if (!LoadedVT.isSimple())
852     return false;
853 
854   AtomicOrdering Ordering = LD->getOrdering();
855   // In order to lower atomic loads with stronger guarantees we would need to
856   // use load.acquire or insert fences. However these features were only added
857   // with PTX ISA 6.0 / sm_70.
858   // TODO: Check if we can actually use the new instructions and implement them.
859   if (isStrongerThanMonotonic(Ordering))
860     return false;
861 
862   // Address Space Setting
863   unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
864   if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
865     return tryLDGLDU(N);
866   }
867 
868   unsigned int PointerSize =
869       CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
870 
871   // Volatile Setting
872   // - .volatile is only available for .global and .shared
873   // - .volatile has the same memory synchronization semantics as .relaxed.sys
874   bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
875   if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
876       CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
877       CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
878     isVolatile = false;
879 
880   // Type Setting: fromType + fromTypeWidth
881   //
882   // Sign   : ISD::SEXTLOAD
883   // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
884   //          type is integer
885   // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
886   MVT SimpleVT = LoadedVT.getSimpleVT();
887   MVT ScalarVT = SimpleVT.getScalarType();
888   // Read at least 8 bits (predicates are stored as 8-bit values)
889   unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
890   unsigned int fromType;
891 
892   // Vector Setting
893   unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
894   if (SimpleVT.isVector()) {
895     assert(LoadedVT == MVT::v2f16 && "Unexpected vector type");
896     // v2f16 is loaded using ld.b32
897     fromTypeWidth = 32;
898   }
899 
900   if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
901     fromType = NVPTX::PTXLdStInstCode::Signed;
902   else if (ScalarVT.isFloatingPoint())
903     // f16 uses .b16 as its storage type.
904     fromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
905                                              : NVPTX::PTXLdStInstCode::Float;
906   else
907     fromType = NVPTX::PTXLdStInstCode::Unsigned;
908 
909   // Create the machine instruction DAG
910   SDValue Chain = N->getOperand(0);
911   SDValue N1 = N->getOperand(1);
912   SDValue Addr;
913   SDValue Offset, Base;
914   Optional<unsigned> Opcode;
915   MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
916 
917   if (SelectDirectAddr(N1, Addr)) {
918     Opcode = pickOpcodeForVT(
919         TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar,
920         NVPTX::LD_i64_avar, NVPTX::LD_f16_avar, NVPTX::LD_f16x2_avar,
921         NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
922     if (!Opcode)
923       return false;
924     SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
925                       getI32Imm(vecType, dl), getI32Imm(fromType, dl),
926                       getI32Imm(fromTypeWidth, dl), Addr, Chain };
927     NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
928                                      MVT::Other, Ops);
929   } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
930                                : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
931     Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
932                                  NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
933                                  NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi,
934                                  NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
935     if (!Opcode)
936       return false;
937     SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
938                       getI32Imm(vecType, dl), getI32Imm(fromType, dl),
939                       getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
940     NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
941                                      MVT::Other, Ops);
942   } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
943                                : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
944     if (PointerSize == 64)
945       Opcode = pickOpcodeForVT(
946           TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
947           NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64,
948           NVPTX::LD_f16x2_ari_64, NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
949     else
950       Opcode = pickOpcodeForVT(
951           TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, NVPTX::LD_i32_ari,
952           NVPTX::LD_i64_ari, NVPTX::LD_f16_ari, NVPTX::LD_f16x2_ari,
953           NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
954     if (!Opcode)
955       return false;
956     SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
957                       getI32Imm(vecType, dl), getI32Imm(fromType, dl),
958                       getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
959     NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
960                                      MVT::Other, Ops);
961   } else {
962     if (PointerSize == 64)
963       Opcode = pickOpcodeForVT(
964           TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
965           NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64,
966           NVPTX::LD_f16x2_areg_64, NVPTX::LD_f32_areg_64,
967           NVPTX::LD_f64_areg_64);
968     else
969       Opcode = pickOpcodeForVT(
970           TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, NVPTX::LD_i32_areg,
971           NVPTX::LD_i64_areg, NVPTX::LD_f16_areg, NVPTX::LD_f16x2_areg,
972           NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
973     if (!Opcode)
974       return false;
975     SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
976                       getI32Imm(vecType, dl), getI32Imm(fromType, dl),
977                       getI32Imm(fromTypeWidth, dl), N1, Chain };
978     NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
979                                      MVT::Other, Ops);
980   }
981 
982   if (!NVPTXLD)
983     return false;
984 
985   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
986   CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
987 
988   ReplaceNode(N, NVPTXLD);
989   return true;
990 }
991 
992 bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
993 
994   SDValue Chain = N->getOperand(0);
995   SDValue Op1 = N->getOperand(1);
996   SDValue Addr, Offset, Base;
997   Optional<unsigned> Opcode;
998   SDLoc DL(N);
999   SDNode *LD;
1000   MemSDNode *MemSD = cast<MemSDNode>(N);
1001   EVT LoadedVT = MemSD->getMemoryVT();
1002 
1003   if (!LoadedVT.isSimple())
1004     return false;
1005 
1006   // Address Space Setting
1007   unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
1008   if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
1009     return tryLDGLDU(N);
1010   }
1011 
1012   unsigned int PointerSize =
1013       CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1014 
1015   // Volatile Setting
1016   // - .volatile is only availalble for .global and .shared
1017   bool IsVolatile = MemSD->isVolatile();
1018   if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1019       CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1020       CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1021     IsVolatile = false;
1022 
1023   // Vector Setting
1024   MVT SimpleVT = LoadedVT.getSimpleVT();
1025 
1026   // Type Setting: fromType + fromTypeWidth
1027   //
1028   // Sign   : ISD::SEXTLOAD
1029   // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
1030   //          type is integer
1031   // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
1032   MVT ScalarVT = SimpleVT.getScalarType();
1033   // Read at least 8 bits (predicates are stored as 8-bit values)
1034   unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
1035   unsigned int FromType;
1036   // The last operand holds the original LoadSDNode::getExtensionType() value
1037   unsigned ExtensionType = cast<ConstantSDNode>(
1038       N->getOperand(N->getNumOperands() - 1))->getZExtValue();
1039   if (ExtensionType == ISD::SEXTLOAD)
1040     FromType = NVPTX::PTXLdStInstCode::Signed;
1041   else if (ScalarVT.isFloatingPoint())
1042     FromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
1043                                              : NVPTX::PTXLdStInstCode::Float;
1044   else
1045     FromType = NVPTX::PTXLdStInstCode::Unsigned;
1046 
1047   unsigned VecType;
1048 
1049   switch (N->getOpcode()) {
1050   case NVPTXISD::LoadV2:
1051     VecType = NVPTX::PTXLdStInstCode::V2;
1052     break;
1053   case NVPTXISD::LoadV4:
1054     VecType = NVPTX::PTXLdStInstCode::V4;
1055     break;
1056   default:
1057     return false;
1058   }
1059 
1060   EVT EltVT = N->getValueType(0);
1061 
1062   // v8f16 is a special case. PTX doesn't have ld.v8.f16
1063   // instruction. Instead, we split the vector into v2f16 chunks and
1064   // load them with ld.v4.b32.
1065   if (EltVT == MVT::v2f16) {
1066     assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
1067     EltVT = MVT::i32;
1068     FromType = NVPTX::PTXLdStInstCode::Untyped;
1069     FromTypeWidth = 32;
1070   }
1071 
1072   if (SelectDirectAddr(Op1, Addr)) {
1073     switch (N->getOpcode()) {
1074     default:
1075       return false;
1076     case NVPTXISD::LoadV2:
1077       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1078                                NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
1079                                NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
1080                                NVPTX::LDV_f16_v2_avar, NVPTX::LDV_f16x2_v2_avar,
1081                                NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
1082       break;
1083     case NVPTXISD::LoadV4:
1084       Opcode =
1085           pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
1086                           NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar, None,
1087                           NVPTX::LDV_f16_v4_avar, NVPTX::LDV_f16x2_v4_avar,
1088                           NVPTX::LDV_f32_v4_avar, None);
1089       break;
1090     }
1091     if (!Opcode)
1092       return false;
1093     SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1094                       getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1095                       getI32Imm(FromTypeWidth, DL), Addr, Chain };
1096     LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
1097   } else if (PointerSize == 64
1098                  ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
1099                  : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
1100     switch (N->getOpcode()) {
1101     default:
1102       return false;
1103     case NVPTXISD::LoadV2:
1104       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1105                                NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
1106                                NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
1107                                NVPTX::LDV_f16_v2_asi, NVPTX::LDV_f16x2_v2_asi,
1108                                NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
1109       break;
1110     case NVPTXISD::LoadV4:
1111       Opcode =
1112           pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
1113                           NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi, None,
1114                           NVPTX::LDV_f16_v4_asi, NVPTX::LDV_f16x2_v4_asi,
1115                           NVPTX::LDV_f32_v4_asi, None);
1116       break;
1117     }
1118     if (!Opcode)
1119       return false;
1120     SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1121                       getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1122                       getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
1123     LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
1124   } else if (PointerSize == 64
1125                  ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1126                  : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1127     if (PointerSize == 64) {
1128       switch (N->getOpcode()) {
1129       default:
1130         return false;
1131       case NVPTXISD::LoadV2:
1132         Opcode = pickOpcodeForVT(
1133             EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_ari_64,
1134             NVPTX::LDV_i16_v2_ari_64, NVPTX::LDV_i32_v2_ari_64,
1135             NVPTX::LDV_i64_v2_ari_64, NVPTX::LDV_f16_v2_ari_64,
1136             NVPTX::LDV_f16x2_v2_ari_64, NVPTX::LDV_f32_v2_ari_64,
1137             NVPTX::LDV_f64_v2_ari_64);
1138         break;
1139       case NVPTXISD::LoadV4:
1140         Opcode = pickOpcodeForVT(
1141             EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
1142             NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, None,
1143             NVPTX::LDV_f16_v4_ari_64, NVPTX::LDV_f16x2_v4_ari_64,
1144             NVPTX::LDV_f32_v4_ari_64, None);
1145         break;
1146       }
1147     } else {
1148       switch (N->getOpcode()) {
1149       default:
1150         return false;
1151       case NVPTXISD::LoadV2:
1152         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1153                                  NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
1154                                  NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
1155                                  NVPTX::LDV_f16_v2_ari, NVPTX::LDV_f16x2_v2_ari,
1156                                  NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
1157         break;
1158       case NVPTXISD::LoadV4:
1159         Opcode =
1160             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
1161                             NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari, None,
1162                             NVPTX::LDV_f16_v4_ari, NVPTX::LDV_f16x2_v4_ari,
1163                             NVPTX::LDV_f32_v4_ari, None);
1164         break;
1165       }
1166     }
1167     if (!Opcode)
1168       return false;
1169     SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1170                       getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1171                       getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
1172 
1173     LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
1174   } else {
1175     if (PointerSize == 64) {
1176       switch (N->getOpcode()) {
1177       default:
1178         return false;
1179       case NVPTXISD::LoadV2:
1180         Opcode = pickOpcodeForVT(
1181             EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
1182             NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
1183             NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f16_v2_areg_64,
1184             NVPTX::LDV_f16x2_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
1185             NVPTX::LDV_f64_v2_areg_64);
1186         break;
1187       case NVPTXISD::LoadV4:
1188         Opcode = pickOpcodeForVT(
1189             EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
1190             NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, None,
1191             NVPTX::LDV_f16_v4_areg_64, NVPTX::LDV_f16x2_v4_areg_64,
1192             NVPTX::LDV_f32_v4_areg_64, None);
1193         break;
1194       }
1195     } else {
1196       switch (N->getOpcode()) {
1197       default:
1198         return false;
1199       case NVPTXISD::LoadV2:
1200         Opcode =
1201             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
1202                             NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
1203                             NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f16_v2_areg,
1204                             NVPTX::LDV_f16x2_v2_areg, NVPTX::LDV_f32_v2_areg,
1205                             NVPTX::LDV_f64_v2_areg);
1206         break;
1207       case NVPTXISD::LoadV4:
1208         Opcode = pickOpcodeForVT(
1209             EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
1210             NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg, None,
1211             NVPTX::LDV_f16_v4_areg, NVPTX::LDV_f16x2_v4_areg,
1212             NVPTX::LDV_f32_v4_areg, None);
1213         break;
1214       }
1215     }
1216     if (!Opcode)
1217       return false;
1218     SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1219                       getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1220                       getI32Imm(FromTypeWidth, DL), Op1, Chain };
1221     LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
1222   }
1223 
1224   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1225   CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
1226 
1227   ReplaceNode(N, LD);
1228   return true;
1229 }
1230 
1231 bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
1232 
1233   SDValue Chain = N->getOperand(0);
1234   SDValue Op1;
1235   MemSDNode *Mem;
1236   bool IsLDG = true;
1237 
1238   // If this is an LDG intrinsic, the address is the third operand. If its an
1239   // LDG/LDU SD node (from custom vector handling), then its the second operand
1240   if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
1241     Op1 = N->getOperand(2);
1242     Mem = cast<MemIntrinsicSDNode>(N);
1243     unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
1244     switch (IID) {
1245     default:
1246       return false;
1247     case Intrinsic::nvvm_ldg_global_f:
1248     case Intrinsic::nvvm_ldg_global_i:
1249     case Intrinsic::nvvm_ldg_global_p:
1250       IsLDG = true;
1251       break;
1252     case Intrinsic::nvvm_ldu_global_f:
1253     case Intrinsic::nvvm_ldu_global_i:
1254     case Intrinsic::nvvm_ldu_global_p:
1255       IsLDG = false;
1256       break;
1257     }
1258   } else {
1259     Op1 = N->getOperand(1);
1260     Mem = cast<MemSDNode>(N);
1261   }
1262 
1263   Optional<unsigned> Opcode;
1264   SDLoc DL(N);
1265   SDNode *LD;
1266   SDValue Base, Offset, Addr;
1267 
1268   EVT EltVT = Mem->getMemoryVT();
1269   unsigned NumElts = 1;
1270   if (EltVT.isVector()) {
1271     NumElts = EltVT.getVectorNumElements();
1272     EltVT = EltVT.getVectorElementType();
1273     // vectors of f16 are loaded/stored as multiples of v2f16 elements.
1274     if (EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) {
1275       assert(NumElts % 2 == 0 && "Vector must have even number of elements");
1276       EltVT = MVT::v2f16;
1277       NumElts /= 2;
1278     }
1279   }
1280 
1281   // Build the "promoted" result VTList for the load. If we are really loading
1282   // i8s, then the return type will be promoted to i16 since we do not expose
1283   // 8-bit registers in NVPTX.
1284   EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
1285   SmallVector<EVT, 5> InstVTs;
1286   for (unsigned i = 0; i != NumElts; ++i) {
1287     InstVTs.push_back(NodeVT);
1288   }
1289   InstVTs.push_back(MVT::Other);
1290   SDVTList InstVTList = CurDAG->getVTList(InstVTs);
1291 
1292   if (SelectDirectAddr(Op1, Addr)) {
1293     switch (N->getOpcode()) {
1294     default:
1295       return false;
1296     case ISD::LOAD:
1297     case ISD::INTRINSIC_W_CHAIN:
1298       if (IsLDG)
1299         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1300                                      NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
1301                                      NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
1302                                      NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
1303                                      NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
1304                                      NVPTX::INT_PTX_LDG_GLOBAL_f16avar,
1305                                      NVPTX::INT_PTX_LDG_GLOBAL_f16x2avar,
1306                                      NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
1307                                      NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
1308       else
1309         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1310                                      NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
1311                                      NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
1312                                      NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
1313                                      NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
1314                                      NVPTX::INT_PTX_LDU_GLOBAL_f16avar,
1315                                      NVPTX::INT_PTX_LDU_GLOBAL_f16x2avar,
1316                                      NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
1317                                      NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
1318       break;
1319     case NVPTXISD::LoadV2:
1320     case NVPTXISD::LDGV2:
1321       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1322                                    NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
1323                                    NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
1324                                    NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
1325                                    NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
1326                                    NVPTX::INT_PTX_LDG_G_v2f16_ELE_avar,
1327                                    NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_avar,
1328                                    NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
1329                                    NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
1330       break;
1331     case NVPTXISD::LDUV2:
1332       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1333                                    NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
1334                                    NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
1335                                    NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
1336                                    NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
1337                                    NVPTX::INT_PTX_LDU_G_v2f16_ELE_avar,
1338                                    NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_avar,
1339                                    NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
1340                                    NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
1341       break;
1342     case NVPTXISD::LoadV4:
1343     case NVPTXISD::LDGV4:
1344       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1345                                NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
1346                                NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
1347                                NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, None,
1348                                NVPTX::INT_PTX_LDG_G_v4f16_ELE_avar,
1349                                NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_avar,
1350                                NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, None);
1351       break;
1352     case NVPTXISD::LDUV4:
1353       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1354                                NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
1355                                NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
1356                                NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, None,
1357                                NVPTX::INT_PTX_LDU_G_v4f16_ELE_avar,
1358                                NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_avar,
1359                                NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, None);
1360       break;
1361     }
1362     if (!Opcode)
1363       return false;
1364     SDValue Ops[] = { Addr, Chain };
1365     LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
1366   } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1367                           : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1368     if (TM.is64Bit()) {
1369       switch (N->getOpcode()) {
1370       default:
1371         return false;
1372       case ISD::LOAD:
1373       case ISD::INTRINSIC_W_CHAIN:
1374         if (IsLDG)
1375           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1376                                        NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
1377                                        NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
1378                                        NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
1379                                        NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
1380                                        NVPTX::INT_PTX_LDG_GLOBAL_f16ari64,
1381                                        NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari64,
1382                                        NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
1383                                        NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
1384         else
1385           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1386                                        NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
1387                                        NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
1388                                        NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
1389                                        NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
1390                                        NVPTX::INT_PTX_LDU_GLOBAL_f16ari64,
1391                                        NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari64,
1392                                        NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
1393                                        NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
1394         break;
1395       case NVPTXISD::LoadV2:
1396       case NVPTXISD::LDGV2:
1397         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1398                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
1399                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
1400                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
1401                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
1402                                      NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari64,
1403                                      NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari64,
1404                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
1405                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
1406         break;
1407       case NVPTXISD::LDUV2:
1408         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1409                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
1410                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
1411                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
1412                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
1413                                      NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari64,
1414                                      NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari64,
1415                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
1416                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
1417         break;
1418       case NVPTXISD::LoadV4:
1419       case NVPTXISD::LDGV4:
1420         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1421                                  NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
1422                                  NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
1423                                  NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, None,
1424                                  NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari64,
1425                                  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari64,
1426                                  NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, None);
1427         break;
1428       case NVPTXISD::LDUV4:
1429         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1430                                  NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
1431                                  NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
1432                                  NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, None,
1433                                  NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari64,
1434                                  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari64,
1435                                  NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, None);
1436         break;
1437       }
1438     } else {
1439       switch (N->getOpcode()) {
1440       default:
1441         return false;
1442       case ISD::LOAD:
1443       case ISD::INTRINSIC_W_CHAIN:
1444         if (IsLDG)
1445           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1446                                        NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
1447                                        NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
1448                                        NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
1449                                        NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
1450                                        NVPTX::INT_PTX_LDG_GLOBAL_f16ari,
1451                                        NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari,
1452                                        NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
1453                                        NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
1454         else
1455           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1456                                        NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
1457                                        NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
1458                                        NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
1459                                        NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
1460                                        NVPTX::INT_PTX_LDU_GLOBAL_f16ari,
1461                                        NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari,
1462                                        NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
1463                                        NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
1464         break;
1465       case NVPTXISD::LoadV2:
1466       case NVPTXISD::LDGV2:
1467         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1468                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
1469                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
1470                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
1471                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
1472                                      NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari32,
1473                                      NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari32,
1474                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
1475                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
1476         break;
1477       case NVPTXISD::LDUV2:
1478         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1479                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
1480                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
1481                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
1482                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
1483                                      NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari32,
1484                                      NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari32,
1485                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
1486                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
1487         break;
1488       case NVPTXISD::LoadV4:
1489       case NVPTXISD::LDGV4:
1490         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1491                                  NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
1492                                  NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
1493                                  NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, None,
1494                                  NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari32,
1495                                  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari32,
1496                                  NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, None);
1497         break;
1498       case NVPTXISD::LDUV4:
1499         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1500                                  NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
1501                                  NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
1502                                  NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, None,
1503                                  NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari32,
1504                                  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari32,
1505                                  NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, None);
1506         break;
1507       }
1508     }
1509     if (!Opcode)
1510       return false;
1511     SDValue Ops[] = {Base, Offset, Chain};
1512     LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
1513   } else {
1514     if (TM.is64Bit()) {
1515       switch (N->getOpcode()) {
1516       default:
1517         return false;
1518       case ISD::LOAD:
1519       case ISD::INTRINSIC_W_CHAIN:
1520         if (IsLDG)
1521           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1522                                        NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
1523                                        NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
1524                                        NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
1525                                        NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
1526                                        NVPTX::INT_PTX_LDG_GLOBAL_f16areg64,
1527                                        NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg64,
1528                                        NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
1529                                        NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
1530         else
1531           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1532                                        NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
1533                                        NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
1534                                        NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
1535                                        NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
1536                                        NVPTX::INT_PTX_LDU_GLOBAL_f16areg64,
1537                                        NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg64,
1538                                        NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
1539                                        NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
1540         break;
1541       case NVPTXISD::LoadV2:
1542       case NVPTXISD::LDGV2:
1543         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1544                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
1545                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
1546                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
1547                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
1548                                      NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg64,
1549                                      NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg64,
1550                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
1551                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
1552         break;
1553       case NVPTXISD::LDUV2:
1554         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1555                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
1556                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
1557                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
1558                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
1559                                      NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg64,
1560                                      NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg64,
1561                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
1562                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
1563         break;
1564       case NVPTXISD::LoadV4:
1565       case NVPTXISD::LDGV4:
1566         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1567                                  NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
1568                                  NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
1569                                  NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, None,
1570                                  NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg64,
1571                                  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg64,
1572                                  NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, None);
1573         break;
1574       case NVPTXISD::LDUV4:
1575         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1576                                  NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
1577                                  NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
1578                                  NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, None,
1579                                  NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg64,
1580                                  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg64,
1581                                  NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, None);
1582         break;
1583       }
1584     } else {
1585       switch (N->getOpcode()) {
1586       default:
1587         return false;
1588       case ISD::LOAD:
1589       case ISD::INTRINSIC_W_CHAIN:
1590         if (IsLDG)
1591           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1592                                    NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
1593                                    NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
1594                                    NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
1595                                    NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
1596                                    NVPTX::INT_PTX_LDG_GLOBAL_f16areg,
1597                                    NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg,
1598                                    NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
1599                                    NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
1600         else
1601           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1602                                    NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
1603                                    NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
1604                                    NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
1605                                    NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
1606                                    NVPTX::INT_PTX_LDU_GLOBAL_f16areg,
1607                                    NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg,
1608                                    NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
1609                                    NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
1610         break;
1611       case NVPTXISD::LoadV2:
1612       case NVPTXISD::LDGV2:
1613         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1614                                  NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
1615                                  NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
1616                                  NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
1617                                  NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
1618                                  NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg32,
1619                                  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg32,
1620                                  NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
1621                                  NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
1622         break;
1623       case NVPTXISD::LDUV2:
1624         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1625                                  NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
1626                                  NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
1627                                  NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
1628                                  NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
1629                                  NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg32,
1630                                  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg32,
1631                                  NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
1632                                  NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
1633         break;
1634       case NVPTXISD::LoadV4:
1635       case NVPTXISD::LDGV4:
1636         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1637                                  NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
1638                                  NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
1639                                  NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, None,
1640                                  NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg32,
1641                                  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg32,
1642                                  NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, None);
1643         break;
1644       case NVPTXISD::LDUV4:
1645         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1646                                  NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
1647                                  NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
1648                                  NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, None,
1649                                  NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg32,
1650                                  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg32,
1651                                  NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, None);
1652         break;
1653       }
1654     }
1655     if (!Opcode)
1656       return false;
1657     SDValue Ops[] = { Op1, Chain };
1658     LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
1659   }
1660 
1661   MachineMemOperand *MemRef = Mem->getMemOperand();
1662   CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
1663 
1664   // For automatic generation of LDG (through SelectLoad[Vector], not the
1665   // intrinsics), we may have an extending load like:
1666   //
1667   //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
1668   //
1669   // In this case, the matching logic above will select a load for the original
1670   // memory type (in this case, i8) and our types will not match (the node needs
1671   // to return an i32 in this case). Our LDG/LDU nodes do not support the
1672   // concept of sign-/zero-extension, so emulate it here by adding an explicit
1673   // CVT instruction. Ptxas should clean up any redundancies here.
1674 
1675   EVT OrigType = N->getValueType(0);
1676   LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
1677 
1678   if (OrigType != EltVT && LdNode) {
1679     // We have an extending-load. The instruction we selected operates on the
1680     // smaller type, but the SDNode we are replacing has the larger type. We
1681     // need to emit a CVT to make the types match.
1682     bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
1683     unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
1684                                        EltVT.getSimpleVT(), IsSigned);
1685 
1686     // For each output value, apply the manual sign/zero-extension and make sure
1687     // all users of the load go through that CVT.
1688     for (unsigned i = 0; i != NumElts; ++i) {
1689       SDValue Res(LD, i);
1690       SDValue OrigVal(N, i);
1691 
1692       SDNode *CvtNode =
1693         CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
1694                                CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
1695                                                          DL, MVT::i32));
1696       ReplaceUses(OrigVal, SDValue(CvtNode, 0));
1697     }
1698   }
1699 
1700   ReplaceNode(N, LD);
1701   return true;
1702 }
1703 
1704 bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1705   SDLoc dl(N);
1706   MemSDNode *ST = cast<MemSDNode>(N);
1707   assert(ST->writeMem() && "Expected store");
1708   StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
1709   AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
1710   assert((PlainStore || AtomicStore) && "Expected store");
1711   EVT StoreVT = ST->getMemoryVT();
1712   SDNode *NVPTXST = nullptr;
1713 
1714   // do not support pre/post inc/dec
1715   if (PlainStore && PlainStore->isIndexed())
1716     return false;
1717 
1718   if (!StoreVT.isSimple())
1719     return false;
1720 
1721   AtomicOrdering Ordering = ST->getOrdering();
1722   // In order to lower atomic loads with stronger guarantees we would need to
1723   // use store.release or insert fences. However these features were only added
1724   // with PTX ISA 6.0 / sm_70.
1725   // TODO: Check if we can actually use the new instructions and implement them.
1726   if (isStrongerThanMonotonic(Ordering))
1727     return false;
1728 
1729   // Address Space Setting
1730   unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
1731   unsigned int PointerSize =
1732       CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
1733 
1734   // Volatile Setting
1735   // - .volatile is only available for .global and .shared
1736   // - .volatile has the same memory synchronization semantics as .relaxed.sys
1737   bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
1738   if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1739       CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1740       CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1741     isVolatile = false;
1742 
1743   // Vector Setting
1744   MVT SimpleVT = StoreVT.getSimpleVT();
1745   unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
1746 
1747   // Type Setting: toType + toTypeWidth
1748   // - for integer type, always use 'u'
1749   //
1750   MVT ScalarVT = SimpleVT.getScalarType();
1751   unsigned toTypeWidth = ScalarVT.getSizeInBits();
1752   if (SimpleVT.isVector()) {
1753     assert(StoreVT == MVT::v2f16 && "Unexpected vector type");
1754     // v2f16 is stored using st.b32
1755     toTypeWidth = 32;
1756   }
1757 
1758   unsigned int toType;
1759   if (ScalarVT.isFloatingPoint())
1760     // f16 uses .b16 as its storage type.
1761     toType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
1762                                            : NVPTX::PTXLdStInstCode::Float;
1763   else
1764     toType = NVPTX::PTXLdStInstCode::Unsigned;
1765 
1766   // Create the machine instruction DAG
1767   SDValue Chain = ST->getChain();
1768   SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
1769   SDValue BasePtr = ST->getBasePtr();
1770   SDValue Addr;
1771   SDValue Offset, Base;
1772   Optional<unsigned> Opcode;
1773   MVT::SimpleValueType SourceVT =
1774       Value.getNode()->getSimpleValueType(0).SimpleTy;
1775 
1776   if (SelectDirectAddr(BasePtr, Addr)) {
1777     Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
1778                              NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
1779                              NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar,
1780                              NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
1781     if (!Opcode)
1782       return false;
1783     SDValue Ops[] = {Value,
1784                      getI32Imm(isVolatile, dl),
1785                      getI32Imm(CodeAddrSpace, dl),
1786                      getI32Imm(vecType, dl),
1787                      getI32Imm(toType, dl),
1788                      getI32Imm(toTypeWidth, dl),
1789                      Addr,
1790                      Chain};
1791     NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
1792   } else if (PointerSize == 64
1793                  ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
1794                  : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
1795     Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
1796                              NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
1797                              NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
1798                              NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
1799     if (!Opcode)
1800       return false;
1801     SDValue Ops[] = {Value,
1802                      getI32Imm(isVolatile, dl),
1803                      getI32Imm(CodeAddrSpace, dl),
1804                      getI32Imm(vecType, dl),
1805                      getI32Imm(toType, dl),
1806                      getI32Imm(toTypeWidth, dl),
1807                      Base,
1808                      Offset,
1809                      Chain};
1810     NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
1811   } else if (PointerSize == 64
1812                  ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
1813                  : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
1814     if (PointerSize == 64)
1815       Opcode = pickOpcodeForVT(
1816           SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
1817           NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64,
1818           NVPTX::ST_f16x2_ari_64, NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
1819     else
1820       Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
1821                                NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
1822                                NVPTX::ST_f16_ari, NVPTX::ST_f16x2_ari,
1823                                NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
1824     if (!Opcode)
1825       return false;
1826 
1827     SDValue Ops[] = {Value,
1828                      getI32Imm(isVolatile, dl),
1829                      getI32Imm(CodeAddrSpace, dl),
1830                      getI32Imm(vecType, dl),
1831                      getI32Imm(toType, dl),
1832                      getI32Imm(toTypeWidth, dl),
1833                      Base,
1834                      Offset,
1835                      Chain};
1836     NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
1837   } else {
1838     if (PointerSize == 64)
1839       Opcode =
1840           pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
1841                           NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
1842                           NVPTX::ST_f16_areg_64, NVPTX::ST_f16x2_areg_64,
1843                           NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
1844     else
1845       Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
1846                                NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
1847                                NVPTX::ST_f16_areg, NVPTX::ST_f16x2_areg,
1848                                NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
1849     if (!Opcode)
1850       return false;
1851     SDValue Ops[] = {Value,
1852                      getI32Imm(isVolatile, dl),
1853                      getI32Imm(CodeAddrSpace, dl),
1854                      getI32Imm(vecType, dl),
1855                      getI32Imm(toType, dl),
1856                      getI32Imm(toTypeWidth, dl),
1857                      BasePtr,
1858                      Chain};
1859     NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
1860   }
1861 
1862   if (!NVPTXST)
1863     return false;
1864 
1865   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1866   CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1867   ReplaceNode(N, NVPTXST);
1868   return true;
1869 }
1870 
1871 bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1872   SDValue Chain = N->getOperand(0);
1873   SDValue Op1 = N->getOperand(1);
1874   SDValue Addr, Offset, Base;
1875   Optional<unsigned> Opcode;
1876   SDLoc DL(N);
1877   SDNode *ST;
1878   EVT EltVT = Op1.getValueType();
1879   MemSDNode *MemSD = cast<MemSDNode>(N);
1880   EVT StoreVT = MemSD->getMemoryVT();
1881 
1882   // Address Space Setting
1883   unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
1884   if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
1885     report_fatal_error("Cannot store to pointer that points to constant "
1886                        "memory space");
1887   }
1888   unsigned int PointerSize =
1889       CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1890 
1891   // Volatile Setting
1892   // - .volatile is only availalble for .global and .shared
1893   bool IsVolatile = MemSD->isVolatile();
1894   if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1895       CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1896       CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1897     IsVolatile = false;
1898 
1899   // Type Setting: toType + toTypeWidth
1900   // - for integer type, always use 'u'
1901   assert(StoreVT.isSimple() && "Store value is not simple");
1902   MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
1903   unsigned ToTypeWidth = ScalarVT.getSizeInBits();
1904   unsigned ToType;
1905   if (ScalarVT.isFloatingPoint())
1906     ToType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
1907                                            : NVPTX::PTXLdStInstCode::Float;
1908   else
1909     ToType = NVPTX::PTXLdStInstCode::Unsigned;
1910 
1911   SmallVector<SDValue, 12> StOps;
1912   SDValue N2;
1913   unsigned VecType;
1914 
1915   switch (N->getOpcode()) {
1916   case NVPTXISD::StoreV2:
1917     VecType = NVPTX::PTXLdStInstCode::V2;
1918     StOps.push_back(N->getOperand(1));
1919     StOps.push_back(N->getOperand(2));
1920     N2 = N->getOperand(3);
1921     break;
1922   case NVPTXISD::StoreV4:
1923     VecType = NVPTX::PTXLdStInstCode::V4;
1924     StOps.push_back(N->getOperand(1));
1925     StOps.push_back(N->getOperand(2));
1926     StOps.push_back(N->getOperand(3));
1927     StOps.push_back(N->getOperand(4));
1928     N2 = N->getOperand(5);
1929     break;
1930   default:
1931     return false;
1932   }
1933 
1934   // v8f16 is a special case. PTX doesn't have st.v8.f16
1935   // instruction. Instead, we split the vector into v2f16 chunks and
1936   // store them with st.v4.b32.
1937   if (EltVT == MVT::v2f16) {
1938     assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
1939     EltVT = MVT::i32;
1940     ToType = NVPTX::PTXLdStInstCode::Untyped;
1941     ToTypeWidth = 32;
1942   }
1943 
1944   StOps.push_back(getI32Imm(IsVolatile, DL));
1945   StOps.push_back(getI32Imm(CodeAddrSpace, DL));
1946   StOps.push_back(getI32Imm(VecType, DL));
1947   StOps.push_back(getI32Imm(ToType, DL));
1948   StOps.push_back(getI32Imm(ToTypeWidth, DL));
1949 
1950   if (SelectDirectAddr(N2, Addr)) {
1951     switch (N->getOpcode()) {
1952     default:
1953       return false;
1954     case NVPTXISD::StoreV2:
1955       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1956                                NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
1957                                NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
1958                                NVPTX::STV_f16_v2_avar, NVPTX::STV_f16x2_v2_avar,
1959                                NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
1960       break;
1961     case NVPTXISD::StoreV4:
1962       Opcode =
1963           pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_avar,
1964                           NVPTX::STV_i16_v4_avar, NVPTX::STV_i32_v4_avar, None,
1965                           NVPTX::STV_f16_v4_avar, NVPTX::STV_f16x2_v4_avar,
1966                           NVPTX::STV_f32_v4_avar, None);
1967       break;
1968     }
1969     StOps.push_back(Addr);
1970   } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
1971                                : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
1972     switch (N->getOpcode()) {
1973     default:
1974       return false;
1975     case NVPTXISD::StoreV2:
1976       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1977                                NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
1978                                NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
1979                                NVPTX::STV_f16_v2_asi, NVPTX::STV_f16x2_v2_asi,
1980                                NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
1981       break;
1982     case NVPTXISD::StoreV4:
1983       Opcode =
1984           pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
1985                           NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi, None,
1986                           NVPTX::STV_f16_v4_asi, NVPTX::STV_f16x2_v4_asi,
1987                           NVPTX::STV_f32_v4_asi, None);
1988       break;
1989     }
1990     StOps.push_back(Base);
1991     StOps.push_back(Offset);
1992   } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
1993                                : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
1994     if (PointerSize == 64) {
1995       switch (N->getOpcode()) {
1996       default:
1997         return false;
1998       case NVPTXISD::StoreV2:
1999         Opcode = pickOpcodeForVT(
2000             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari_64,
2001             NVPTX::STV_i16_v2_ari_64, NVPTX::STV_i32_v2_ari_64,
2002             NVPTX::STV_i64_v2_ari_64, NVPTX::STV_f16_v2_ari_64,
2003             NVPTX::STV_f16x2_v2_ari_64, NVPTX::STV_f32_v2_ari_64,
2004             NVPTX::STV_f64_v2_ari_64);
2005         break;
2006       case NVPTXISD::StoreV4:
2007         Opcode = pickOpcodeForVT(
2008             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
2009             NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, None,
2010             NVPTX::STV_f16_v4_ari_64, NVPTX::STV_f16x2_v4_ari_64,
2011             NVPTX::STV_f32_v4_ari_64, None);
2012         break;
2013       }
2014     } else {
2015       switch (N->getOpcode()) {
2016       default:
2017         return false;
2018       case NVPTXISD::StoreV2:
2019         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2020                                  NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
2021                                  NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
2022                                  NVPTX::STV_f16_v2_ari, NVPTX::STV_f16x2_v2_ari,
2023                                  NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
2024         break;
2025       case NVPTXISD::StoreV4:
2026         Opcode =
2027             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari,
2028                             NVPTX::STV_i16_v4_ari, NVPTX::STV_i32_v4_ari, None,
2029                             NVPTX::STV_f16_v4_ari, NVPTX::STV_f16x2_v4_ari,
2030                             NVPTX::STV_f32_v4_ari, None);
2031         break;
2032       }
2033     }
2034     StOps.push_back(Base);
2035     StOps.push_back(Offset);
2036   } else {
2037     if (PointerSize == 64) {
2038       switch (N->getOpcode()) {
2039       default:
2040         return false;
2041       case NVPTXISD::StoreV2:
2042         Opcode = pickOpcodeForVT(
2043             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
2044             NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
2045             NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f16_v2_areg_64,
2046             NVPTX::STV_f16x2_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
2047             NVPTX::STV_f64_v2_areg_64);
2048         break;
2049       case NVPTXISD::StoreV4:
2050         Opcode = pickOpcodeForVT(
2051             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
2052             NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, None,
2053             NVPTX::STV_f16_v4_areg_64, NVPTX::STV_f16x2_v4_areg_64,
2054             NVPTX::STV_f32_v4_areg_64, None);
2055         break;
2056       }
2057     } else {
2058       switch (N->getOpcode()) {
2059       default:
2060         return false;
2061       case NVPTXISD::StoreV2:
2062         Opcode =
2063             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
2064                             NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
2065                             NVPTX::STV_i64_v2_areg, NVPTX::STV_f16_v2_areg,
2066                             NVPTX::STV_f16x2_v2_areg, NVPTX::STV_f32_v2_areg,
2067                             NVPTX::STV_f64_v2_areg);
2068         break;
2069       case NVPTXISD::StoreV4:
2070         Opcode =
2071             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
2072                             NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg, None,
2073                             NVPTX::STV_f16_v4_areg, NVPTX::STV_f16x2_v4_areg,
2074                             NVPTX::STV_f32_v4_areg, None);
2075         break;
2076       }
2077     }
2078     StOps.push_back(N2);
2079   }
2080 
2081   if (!Opcode)
2082     return false;
2083 
2084   StOps.push_back(Chain);
2085 
2086   ST = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, StOps);
2087 
2088   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2089   CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
2090 
2091   ReplaceNode(N, ST);
2092   return true;
2093 }
2094 
2095 bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
2096   SDValue Chain = Node->getOperand(0);
2097   SDValue Offset = Node->getOperand(2);
2098   SDValue Flag = Node->getOperand(3);
2099   SDLoc DL(Node);
2100   MemSDNode *Mem = cast<MemSDNode>(Node);
2101 
2102   unsigned VecSize;
2103   switch (Node->getOpcode()) {
2104   default:
2105     return false;
2106   case NVPTXISD::LoadParam:
2107     VecSize = 1;
2108     break;
2109   case NVPTXISD::LoadParamV2:
2110     VecSize = 2;
2111     break;
2112   case NVPTXISD::LoadParamV4:
2113     VecSize = 4;
2114     break;
2115   }
2116 
2117   EVT EltVT = Node->getValueType(0);
2118   EVT MemVT = Mem->getMemoryVT();
2119 
2120   Optional<unsigned> Opcode;
2121 
2122   switch (VecSize) {
2123   default:
2124     return false;
2125   case 1:
2126     Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
2127                              NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
2128                              NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
2129                              NVPTX::LoadParamMemF16, NVPTX::LoadParamMemF16x2,
2130                              NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
2131     break;
2132   case 2:
2133     Opcode =
2134         pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
2135                         NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
2136                         NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F16,
2137                         NVPTX::LoadParamMemV2F16x2, NVPTX::LoadParamMemV2F32,
2138                         NVPTX::LoadParamMemV2F64);
2139     break;
2140   case 4:
2141     Opcode = pickOpcodeForVT(
2142         MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
2143         NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, None,
2144         NVPTX::LoadParamMemV4F16, NVPTX::LoadParamMemV4F16x2,
2145         NVPTX::LoadParamMemV4F32, None);
2146     break;
2147   }
2148   if (!Opcode)
2149     return false;
2150 
2151   SDVTList VTs;
2152   if (VecSize == 1) {
2153     VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
2154   } else if (VecSize == 2) {
2155     VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
2156   } else {
2157     EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
2158     VTs = CurDAG->getVTList(EVTs);
2159   }
2160 
2161   unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2162 
2163   SmallVector<SDValue, 2> Ops;
2164   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2165   Ops.push_back(Chain);
2166   Ops.push_back(Flag);
2167 
2168   ReplaceNode(Node, CurDAG->getMachineNode(Opcode.getValue(), DL, VTs, Ops));
2169   return true;
2170 }
2171 
2172 bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
2173   SDLoc DL(N);
2174   SDValue Chain = N->getOperand(0);
2175   SDValue Offset = N->getOperand(1);
2176   unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2177   MemSDNode *Mem = cast<MemSDNode>(N);
2178 
2179   // How many elements do we have?
2180   unsigned NumElts = 1;
2181   switch (N->getOpcode()) {
2182   default:
2183     return false;
2184   case NVPTXISD::StoreRetval:
2185     NumElts = 1;
2186     break;
2187   case NVPTXISD::StoreRetvalV2:
2188     NumElts = 2;
2189     break;
2190   case NVPTXISD::StoreRetvalV4:
2191     NumElts = 4;
2192     break;
2193   }
2194 
2195   // Build vector of operands
2196   SmallVector<SDValue, 6> Ops;
2197   for (unsigned i = 0; i < NumElts; ++i)
2198     Ops.push_back(N->getOperand(i + 2));
2199   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2200   Ops.push_back(Chain);
2201 
2202   // Determine target opcode
2203   // If we have an i1, use an 8-bit store. The lowering code in
2204   // NVPTXISelLowering will have already emitted an upcast.
2205   Optional<unsigned> Opcode = 0;
2206   switch (NumElts) {
2207   default:
2208     return false;
2209   case 1:
2210     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2211                              NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
2212                              NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
2213                              NVPTX::StoreRetvalF16, NVPTX::StoreRetvalF16x2,
2214                              NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
2215     break;
2216   case 2:
2217     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2218                              NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2219                              NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2220                              NVPTX::StoreRetvalV2F16, NVPTX::StoreRetvalV2F16x2,
2221                              NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2222     break;
2223   case 4:
2224     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2225                              NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2226                              NVPTX::StoreRetvalV4I32, None,
2227                              NVPTX::StoreRetvalV4F16, NVPTX::StoreRetvalV4F16x2,
2228                              NVPTX::StoreRetvalV4F32, None);
2229     break;
2230   }
2231   if (!Opcode)
2232     return false;
2233 
2234   SDNode *Ret = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, Ops);
2235   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2236   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2237 
2238   ReplaceNode(N, Ret);
2239   return true;
2240 }
2241 
2242 bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2243   SDLoc DL(N);
2244   SDValue Chain = N->getOperand(0);
2245   SDValue Param = N->getOperand(1);
2246   unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
2247   SDValue Offset = N->getOperand(2);
2248   unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2249   MemSDNode *Mem = cast<MemSDNode>(N);
2250   SDValue Flag = N->getOperand(N->getNumOperands() - 1);
2251 
2252   // How many elements do we have?
2253   unsigned NumElts = 1;
2254   switch (N->getOpcode()) {
2255   default:
2256     return false;
2257   case NVPTXISD::StoreParamU32:
2258   case NVPTXISD::StoreParamS32:
2259   case NVPTXISD::StoreParam:
2260     NumElts = 1;
2261     break;
2262   case NVPTXISD::StoreParamV2:
2263     NumElts = 2;
2264     break;
2265   case NVPTXISD::StoreParamV4:
2266     NumElts = 4;
2267     break;
2268   }
2269 
2270   // Build vector of operands
2271   SmallVector<SDValue, 8> Ops;
2272   for (unsigned i = 0; i < NumElts; ++i)
2273     Ops.push_back(N->getOperand(i + 3));
2274   Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
2275   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2276   Ops.push_back(Chain);
2277   Ops.push_back(Flag);
2278 
2279   // Determine target opcode
2280   // If we have an i1, use an 8-bit store. The lowering code in
2281   // NVPTXISelLowering will have already emitted an upcast.
2282   Optional<unsigned> Opcode = 0;
2283   switch (N->getOpcode()) {
2284   default:
2285     switch (NumElts) {
2286     default:
2287       return false;
2288     case 1:
2289       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2290                                NVPTX::StoreParamI8, NVPTX::StoreParamI16,
2291                                NVPTX::StoreParamI32, NVPTX::StoreParamI64,
2292                                NVPTX::StoreParamF16, NVPTX::StoreParamF16x2,
2293                                NVPTX::StoreParamF32, NVPTX::StoreParamF64);
2294       break;
2295     case 2:
2296       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2297                                NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
2298                                NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
2299                                NVPTX::StoreParamV2F16, NVPTX::StoreParamV2F16x2,
2300                                NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
2301       break;
2302     case 4:
2303       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2304                                NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
2305                                NVPTX::StoreParamV4I32, None,
2306                                NVPTX::StoreParamV4F16, NVPTX::StoreParamV4F16x2,
2307                                NVPTX::StoreParamV4F32, None);
2308       break;
2309     }
2310     if (!Opcode)
2311       return false;
2312     break;
2313   // Special case: if we have a sign-extend/zero-extend node, insert the
2314   // conversion instruction first, and use that as the value operand to
2315   // the selected StoreParam node.
2316   case NVPTXISD::StoreParamU32: {
2317     Opcode = NVPTX::StoreParamI32;
2318     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2319                                                 MVT::i32);
2320     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2321                                          MVT::i32, Ops[0], CvtNone);
2322     Ops[0] = SDValue(Cvt, 0);
2323     break;
2324   }
2325   case NVPTXISD::StoreParamS32: {
2326     Opcode = NVPTX::StoreParamI32;
2327     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2328                                                 MVT::i32);
2329     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2330                                          MVT::i32, Ops[0], CvtNone);
2331     Ops[0] = SDValue(Cvt, 0);
2332     break;
2333   }
2334   }
2335 
2336   SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2337   SDNode *Ret =
2338       CurDAG->getMachineNode(Opcode.getValue(), DL, RetVTs, Ops);
2339   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2340   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2341 
2342   ReplaceNode(N, Ret);
2343   return true;
2344 }
2345 
2346 bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
2347   unsigned Opc = 0;
2348 
2349   switch (N->getOpcode()) {
2350   default: return false;
2351   case NVPTXISD::Tex1DFloatS32:
2352     Opc = NVPTX::TEX_1D_F32_S32;
2353     break;
2354   case NVPTXISD::Tex1DFloatFloat:
2355     Opc = NVPTX::TEX_1D_F32_F32;
2356     break;
2357   case NVPTXISD::Tex1DFloatFloatLevel:
2358     Opc = NVPTX::TEX_1D_F32_F32_LEVEL;
2359     break;
2360   case NVPTXISD::Tex1DFloatFloatGrad:
2361     Opc = NVPTX::TEX_1D_F32_F32_GRAD;
2362     break;
2363   case NVPTXISD::Tex1DS32S32:
2364     Opc = NVPTX::TEX_1D_S32_S32;
2365     break;
2366   case NVPTXISD::Tex1DS32Float:
2367     Opc = NVPTX::TEX_1D_S32_F32;
2368     break;
2369   case NVPTXISD::Tex1DS32FloatLevel:
2370     Opc = NVPTX::TEX_1D_S32_F32_LEVEL;
2371     break;
2372   case NVPTXISD::Tex1DS32FloatGrad:
2373     Opc = NVPTX::TEX_1D_S32_F32_GRAD;
2374     break;
2375   case NVPTXISD::Tex1DU32S32:
2376     Opc = NVPTX::TEX_1D_U32_S32;
2377     break;
2378   case NVPTXISD::Tex1DU32Float:
2379     Opc = NVPTX::TEX_1D_U32_F32;
2380     break;
2381   case NVPTXISD::Tex1DU32FloatLevel:
2382     Opc = NVPTX::TEX_1D_U32_F32_LEVEL;
2383     break;
2384   case NVPTXISD::Tex1DU32FloatGrad:
2385     Opc = NVPTX::TEX_1D_U32_F32_GRAD;
2386     break;
2387   case NVPTXISD::Tex1DArrayFloatS32:
2388     Opc = NVPTX::TEX_1D_ARRAY_F32_S32;
2389     break;
2390   case NVPTXISD::Tex1DArrayFloatFloat:
2391     Opc = NVPTX::TEX_1D_ARRAY_F32_F32;
2392     break;
2393   case NVPTXISD::Tex1DArrayFloatFloatLevel:
2394     Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL;
2395     break;
2396   case NVPTXISD::Tex1DArrayFloatFloatGrad:
2397     Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD;
2398     break;
2399   case NVPTXISD::Tex1DArrayS32S32:
2400     Opc = NVPTX::TEX_1D_ARRAY_S32_S32;
2401     break;
2402   case NVPTXISD::Tex1DArrayS32Float:
2403     Opc = NVPTX::TEX_1D_ARRAY_S32_F32;
2404     break;
2405   case NVPTXISD::Tex1DArrayS32FloatLevel:
2406     Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL;
2407     break;
2408   case NVPTXISD::Tex1DArrayS32FloatGrad:
2409     Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD;
2410     break;
2411   case NVPTXISD::Tex1DArrayU32S32:
2412     Opc = NVPTX::TEX_1D_ARRAY_U32_S32;
2413     break;
2414   case NVPTXISD::Tex1DArrayU32Float:
2415     Opc = NVPTX::TEX_1D_ARRAY_U32_F32;
2416     break;
2417   case NVPTXISD::Tex1DArrayU32FloatLevel:
2418     Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL;
2419     break;
2420   case NVPTXISD::Tex1DArrayU32FloatGrad:
2421     Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD;
2422     break;
2423   case NVPTXISD::Tex2DFloatS32:
2424     Opc = NVPTX::TEX_2D_F32_S32;
2425     break;
2426   case NVPTXISD::Tex2DFloatFloat:
2427     Opc = NVPTX::TEX_2D_F32_F32;
2428     break;
2429   case NVPTXISD::Tex2DFloatFloatLevel:
2430     Opc = NVPTX::TEX_2D_F32_F32_LEVEL;
2431     break;
2432   case NVPTXISD::Tex2DFloatFloatGrad:
2433     Opc = NVPTX::TEX_2D_F32_F32_GRAD;
2434     break;
2435   case NVPTXISD::Tex2DS32S32:
2436     Opc = NVPTX::TEX_2D_S32_S32;
2437     break;
2438   case NVPTXISD::Tex2DS32Float:
2439     Opc = NVPTX::TEX_2D_S32_F32;
2440     break;
2441   case NVPTXISD::Tex2DS32FloatLevel:
2442     Opc = NVPTX::TEX_2D_S32_F32_LEVEL;
2443     break;
2444   case NVPTXISD::Tex2DS32FloatGrad:
2445     Opc = NVPTX::TEX_2D_S32_F32_GRAD;
2446     break;
2447   case NVPTXISD::Tex2DU32S32:
2448     Opc = NVPTX::TEX_2D_U32_S32;
2449     break;
2450   case NVPTXISD::Tex2DU32Float:
2451     Opc = NVPTX::TEX_2D_U32_F32;
2452     break;
2453   case NVPTXISD::Tex2DU32FloatLevel:
2454     Opc = NVPTX::TEX_2D_U32_F32_LEVEL;
2455     break;
2456   case NVPTXISD::Tex2DU32FloatGrad:
2457     Opc = NVPTX::TEX_2D_U32_F32_GRAD;
2458     break;
2459   case NVPTXISD::Tex2DArrayFloatS32:
2460     Opc = NVPTX::TEX_2D_ARRAY_F32_S32;
2461     break;
2462   case NVPTXISD::Tex2DArrayFloatFloat:
2463     Opc = NVPTX::TEX_2D_ARRAY_F32_F32;
2464     break;
2465   case NVPTXISD::Tex2DArrayFloatFloatLevel:
2466     Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL;
2467     break;
2468   case NVPTXISD::Tex2DArrayFloatFloatGrad:
2469     Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD;
2470     break;
2471   case NVPTXISD::Tex2DArrayS32S32:
2472     Opc = NVPTX::TEX_2D_ARRAY_S32_S32;
2473     break;
2474   case NVPTXISD::Tex2DArrayS32Float:
2475     Opc = NVPTX::TEX_2D_ARRAY_S32_F32;
2476     break;
2477   case NVPTXISD::Tex2DArrayS32FloatLevel:
2478     Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL;
2479     break;
2480   case NVPTXISD::Tex2DArrayS32FloatGrad:
2481     Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD;
2482     break;
2483   case NVPTXISD::Tex2DArrayU32S32:
2484     Opc = NVPTX::TEX_2D_ARRAY_U32_S32;
2485     break;
2486   case NVPTXISD::Tex2DArrayU32Float:
2487     Opc = NVPTX::TEX_2D_ARRAY_U32_F32;
2488     break;
2489   case NVPTXISD::Tex2DArrayU32FloatLevel:
2490     Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL;
2491     break;
2492   case NVPTXISD::Tex2DArrayU32FloatGrad:
2493     Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD;
2494     break;
2495   case NVPTXISD::Tex3DFloatS32:
2496     Opc = NVPTX::TEX_3D_F32_S32;
2497     break;
2498   case NVPTXISD::Tex3DFloatFloat:
2499     Opc = NVPTX::TEX_3D_F32_F32;
2500     break;
2501   case NVPTXISD::Tex3DFloatFloatLevel:
2502     Opc = NVPTX::TEX_3D_F32_F32_LEVEL;
2503     break;
2504   case NVPTXISD::Tex3DFloatFloatGrad:
2505     Opc = NVPTX::TEX_3D_F32_F32_GRAD;
2506     break;
2507   case NVPTXISD::Tex3DS32S32:
2508     Opc = NVPTX::TEX_3D_S32_S32;
2509     break;
2510   case NVPTXISD::Tex3DS32Float:
2511     Opc = NVPTX::TEX_3D_S32_F32;
2512     break;
2513   case NVPTXISD::Tex3DS32FloatLevel:
2514     Opc = NVPTX::TEX_3D_S32_F32_LEVEL;
2515     break;
2516   case NVPTXISD::Tex3DS32FloatGrad:
2517     Opc = NVPTX::TEX_3D_S32_F32_GRAD;
2518     break;
2519   case NVPTXISD::Tex3DU32S32:
2520     Opc = NVPTX::TEX_3D_U32_S32;
2521     break;
2522   case NVPTXISD::Tex3DU32Float:
2523     Opc = NVPTX::TEX_3D_U32_F32;
2524     break;
2525   case NVPTXISD::Tex3DU32FloatLevel:
2526     Opc = NVPTX::TEX_3D_U32_F32_LEVEL;
2527     break;
2528   case NVPTXISD::Tex3DU32FloatGrad:
2529     Opc = NVPTX::TEX_3D_U32_F32_GRAD;
2530     break;
2531   case NVPTXISD::TexCubeFloatFloat:
2532     Opc = NVPTX::TEX_CUBE_F32_F32;
2533     break;
2534   case NVPTXISD::TexCubeFloatFloatLevel:
2535     Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL;
2536     break;
2537   case NVPTXISD::TexCubeS32Float:
2538     Opc = NVPTX::TEX_CUBE_S32_F32;
2539     break;
2540   case NVPTXISD::TexCubeS32FloatLevel:
2541     Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL;
2542     break;
2543   case NVPTXISD::TexCubeU32Float:
2544     Opc = NVPTX::TEX_CUBE_U32_F32;
2545     break;
2546   case NVPTXISD::TexCubeU32FloatLevel:
2547     Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL;
2548     break;
2549   case NVPTXISD::TexCubeArrayFloatFloat:
2550     Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32;
2551     break;
2552   case NVPTXISD::TexCubeArrayFloatFloatLevel:
2553     Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL;
2554     break;
2555   case NVPTXISD::TexCubeArrayS32Float:
2556     Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32;
2557     break;
2558   case NVPTXISD::TexCubeArrayS32FloatLevel:
2559     Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL;
2560     break;
2561   case NVPTXISD::TexCubeArrayU32Float:
2562     Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32;
2563     break;
2564   case NVPTXISD::TexCubeArrayU32FloatLevel:
2565     Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL;
2566     break;
2567   case NVPTXISD::Tld4R2DFloatFloat:
2568     Opc = NVPTX::TLD4_R_2D_F32_F32;
2569     break;
2570   case NVPTXISD::Tld4G2DFloatFloat:
2571     Opc = NVPTX::TLD4_G_2D_F32_F32;
2572     break;
2573   case NVPTXISD::Tld4B2DFloatFloat:
2574     Opc = NVPTX::TLD4_B_2D_F32_F32;
2575     break;
2576   case NVPTXISD::Tld4A2DFloatFloat:
2577     Opc = NVPTX::TLD4_A_2D_F32_F32;
2578     break;
2579   case NVPTXISD::Tld4R2DS64Float:
2580     Opc = NVPTX::TLD4_R_2D_S32_F32;
2581     break;
2582   case NVPTXISD::Tld4G2DS64Float:
2583     Opc = NVPTX::TLD4_G_2D_S32_F32;
2584     break;
2585   case NVPTXISD::Tld4B2DS64Float:
2586     Opc = NVPTX::TLD4_B_2D_S32_F32;
2587     break;
2588   case NVPTXISD::Tld4A2DS64Float:
2589     Opc = NVPTX::TLD4_A_2D_S32_F32;
2590     break;
2591   case NVPTXISD::Tld4R2DU64Float:
2592     Opc = NVPTX::TLD4_R_2D_U32_F32;
2593     break;
2594   case NVPTXISD::Tld4G2DU64Float:
2595     Opc = NVPTX::TLD4_G_2D_U32_F32;
2596     break;
2597   case NVPTXISD::Tld4B2DU64Float:
2598     Opc = NVPTX::TLD4_B_2D_U32_F32;
2599     break;
2600   case NVPTXISD::Tld4A2DU64Float:
2601     Opc = NVPTX::TLD4_A_2D_U32_F32;
2602     break;
2603   case NVPTXISD::TexUnified1DFloatS32:
2604     Opc = NVPTX::TEX_UNIFIED_1D_F32_S32;
2605     break;
2606   case NVPTXISD::TexUnified1DFloatFloat:
2607     Opc = NVPTX::TEX_UNIFIED_1D_F32_F32;
2608     break;
2609   case NVPTXISD::TexUnified1DFloatFloatLevel:
2610     Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL;
2611     break;
2612   case NVPTXISD::TexUnified1DFloatFloatGrad:
2613     Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD;
2614     break;
2615   case NVPTXISD::TexUnified1DS32S32:
2616     Opc = NVPTX::TEX_UNIFIED_1D_S32_S32;
2617     break;
2618   case NVPTXISD::TexUnified1DS32Float:
2619     Opc = NVPTX::TEX_UNIFIED_1D_S32_F32;
2620     break;
2621   case NVPTXISD::TexUnified1DS32FloatLevel:
2622     Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL;
2623     break;
2624   case NVPTXISD::TexUnified1DS32FloatGrad:
2625     Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD;
2626     break;
2627   case NVPTXISD::TexUnified1DU32S32:
2628     Opc = NVPTX::TEX_UNIFIED_1D_U32_S32;
2629     break;
2630   case NVPTXISD::TexUnified1DU32Float:
2631     Opc = NVPTX::TEX_UNIFIED_1D_U32_F32;
2632     break;
2633   case NVPTXISD::TexUnified1DU32FloatLevel:
2634     Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL;
2635     break;
2636   case NVPTXISD::TexUnified1DU32FloatGrad:
2637     Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD;
2638     break;
2639   case NVPTXISD::TexUnified1DArrayFloatS32:
2640     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32;
2641     break;
2642   case NVPTXISD::TexUnified1DArrayFloatFloat:
2643     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32;
2644     break;
2645   case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
2646     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL;
2647     break;
2648   case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
2649     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD;
2650     break;
2651   case NVPTXISD::TexUnified1DArrayS32S32:
2652     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32;
2653     break;
2654   case NVPTXISD::TexUnified1DArrayS32Float:
2655     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32;
2656     break;
2657   case NVPTXISD::TexUnified1DArrayS32FloatLevel:
2658     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL;
2659     break;
2660   case NVPTXISD::TexUnified1DArrayS32FloatGrad:
2661     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD;
2662     break;
2663   case NVPTXISD::TexUnified1DArrayU32S32:
2664     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32;
2665     break;
2666   case NVPTXISD::TexUnified1DArrayU32Float:
2667     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32;
2668     break;
2669   case NVPTXISD::TexUnified1DArrayU32FloatLevel:
2670     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL;
2671     break;
2672   case NVPTXISD::TexUnified1DArrayU32FloatGrad:
2673     Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD;
2674     break;
2675   case NVPTXISD::TexUnified2DFloatS32:
2676     Opc = NVPTX::TEX_UNIFIED_2D_F32_S32;
2677     break;
2678   case NVPTXISD::TexUnified2DFloatFloat:
2679     Opc = NVPTX::TEX_UNIFIED_2D_F32_F32;
2680     break;
2681   case NVPTXISD::TexUnified2DFloatFloatLevel:
2682     Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL;
2683     break;
2684   case NVPTXISD::TexUnified2DFloatFloatGrad:
2685     Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD;
2686     break;
2687   case NVPTXISD::TexUnified2DS32S32:
2688     Opc = NVPTX::TEX_UNIFIED_2D_S32_S32;
2689     break;
2690   case NVPTXISD::TexUnified2DS32Float:
2691     Opc = NVPTX::TEX_UNIFIED_2D_S32_F32;
2692     break;
2693   case NVPTXISD::TexUnified2DS32FloatLevel:
2694     Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL;
2695     break;
2696   case NVPTXISD::TexUnified2DS32FloatGrad:
2697     Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD;
2698     break;
2699   case NVPTXISD::TexUnified2DU32S32:
2700     Opc = NVPTX::TEX_UNIFIED_2D_U32_S32;
2701     break;
2702   case NVPTXISD::TexUnified2DU32Float:
2703     Opc = NVPTX::TEX_UNIFIED_2D_U32_F32;
2704     break;
2705   case NVPTXISD::TexUnified2DU32FloatLevel:
2706     Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL;
2707     break;
2708   case NVPTXISD::TexUnified2DU32FloatGrad:
2709     Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD;
2710     break;
2711   case NVPTXISD::TexUnified2DArrayFloatS32:
2712     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32;
2713     break;
2714   case NVPTXISD::TexUnified2DArrayFloatFloat:
2715     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32;
2716     break;
2717   case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
2718     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL;
2719     break;
2720   case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
2721     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD;
2722     break;
2723   case NVPTXISD::TexUnified2DArrayS32S32:
2724     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32;
2725     break;
2726   case NVPTXISD::TexUnified2DArrayS32Float:
2727     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32;
2728     break;
2729   case NVPTXISD::TexUnified2DArrayS32FloatLevel:
2730     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL;
2731     break;
2732   case NVPTXISD::TexUnified2DArrayS32FloatGrad:
2733     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD;
2734     break;
2735   case NVPTXISD::TexUnified2DArrayU32S32:
2736     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32;
2737     break;
2738   case NVPTXISD::TexUnified2DArrayU32Float:
2739     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32;
2740     break;
2741   case NVPTXISD::TexUnified2DArrayU32FloatLevel:
2742     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL;
2743     break;
2744   case NVPTXISD::TexUnified2DArrayU32FloatGrad:
2745     Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD;
2746     break;
2747   case NVPTXISD::TexUnified3DFloatS32:
2748     Opc = NVPTX::TEX_UNIFIED_3D_F32_S32;
2749     break;
2750   case NVPTXISD::TexUnified3DFloatFloat:
2751     Opc = NVPTX::TEX_UNIFIED_3D_F32_F32;
2752     break;
2753   case NVPTXISD::TexUnified3DFloatFloatLevel:
2754     Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL;
2755     break;
2756   case NVPTXISD::TexUnified3DFloatFloatGrad:
2757     Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD;
2758     break;
2759   case NVPTXISD::TexUnified3DS32S32:
2760     Opc = NVPTX::TEX_UNIFIED_3D_S32_S32;
2761     break;
2762   case NVPTXISD::TexUnified3DS32Float:
2763     Opc = NVPTX::TEX_UNIFIED_3D_S32_F32;
2764     break;
2765   case NVPTXISD::TexUnified3DS32FloatLevel:
2766     Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL;
2767     break;
2768   case NVPTXISD::TexUnified3DS32FloatGrad:
2769     Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD;
2770     break;
2771   case NVPTXISD::TexUnified3DU32S32:
2772     Opc = NVPTX::TEX_UNIFIED_3D_U32_S32;
2773     break;
2774   case NVPTXISD::TexUnified3DU32Float:
2775     Opc = NVPTX::TEX_UNIFIED_3D_U32_F32;
2776     break;
2777   case NVPTXISD::TexUnified3DU32FloatLevel:
2778     Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL;
2779     break;
2780   case NVPTXISD::TexUnified3DU32FloatGrad:
2781     Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD;
2782     break;
2783   case NVPTXISD::TexUnifiedCubeFloatFloat:
2784     Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32;
2785     break;
2786   case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
2787     Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL;
2788     break;
2789   case NVPTXISD::TexUnifiedCubeS32Float:
2790     Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32;
2791     break;
2792   case NVPTXISD::TexUnifiedCubeS32FloatLevel:
2793     Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL;
2794     break;
2795   case NVPTXISD::TexUnifiedCubeU32Float:
2796     Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32;
2797     break;
2798   case NVPTXISD::TexUnifiedCubeU32FloatLevel:
2799     Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL;
2800     break;
2801   case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
2802     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32;
2803     break;
2804   case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
2805     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL;
2806     break;
2807   case NVPTXISD::TexUnifiedCubeArrayS32Float:
2808     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32;
2809     break;
2810   case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
2811     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL;
2812     break;
2813   case NVPTXISD::TexUnifiedCubeArrayU32Float:
2814     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32;
2815     break;
2816   case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
2817     Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL;
2818     break;
2819   case NVPTXISD::Tld4UnifiedR2DFloatFloat:
2820     Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32;
2821     break;
2822   case NVPTXISD::Tld4UnifiedG2DFloatFloat:
2823     Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32;
2824     break;
2825   case NVPTXISD::Tld4UnifiedB2DFloatFloat:
2826     Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32;
2827     break;
2828   case NVPTXISD::Tld4UnifiedA2DFloatFloat:
2829     Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32;
2830     break;
2831   case NVPTXISD::Tld4UnifiedR2DS64Float:
2832     Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32;
2833     break;
2834   case NVPTXISD::Tld4UnifiedG2DS64Float:
2835     Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32;
2836     break;
2837   case NVPTXISD::Tld4UnifiedB2DS64Float:
2838     Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32;
2839     break;
2840   case NVPTXISD::Tld4UnifiedA2DS64Float:
2841     Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32;
2842     break;
2843   case NVPTXISD::Tld4UnifiedR2DU64Float:
2844     Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32;
2845     break;
2846   case NVPTXISD::Tld4UnifiedG2DU64Float:
2847     Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32;
2848     break;
2849   case NVPTXISD::Tld4UnifiedB2DU64Float:
2850     Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32;
2851     break;
2852   case NVPTXISD::Tld4UnifiedA2DU64Float:
2853     Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32;
2854     break;
2855   }
2856 
2857   // Copy over operands
2858   SmallVector<SDValue, 8> Ops(N->op_begin() + 1, N->op_end());
2859   Ops.push_back(N->getOperand(0)); // Move chain to the back.
2860 
2861   ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
2862   return true;
2863 }
2864 
2865 bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
2866   unsigned Opc = 0;
2867   switch (N->getOpcode()) {
2868   default: return false;
2869   case NVPTXISD::Suld1DI8Clamp:
2870     Opc = NVPTX::SULD_1D_I8_CLAMP;
2871     break;
2872   case NVPTXISD::Suld1DI16Clamp:
2873     Opc = NVPTX::SULD_1D_I16_CLAMP;
2874     break;
2875   case NVPTXISD::Suld1DI32Clamp:
2876     Opc = NVPTX::SULD_1D_I32_CLAMP;
2877     break;
2878   case NVPTXISD::Suld1DI64Clamp:
2879     Opc = NVPTX::SULD_1D_I64_CLAMP;
2880     break;
2881   case NVPTXISD::Suld1DV2I8Clamp:
2882     Opc = NVPTX::SULD_1D_V2I8_CLAMP;
2883     break;
2884   case NVPTXISD::Suld1DV2I16Clamp:
2885     Opc = NVPTX::SULD_1D_V2I16_CLAMP;
2886     break;
2887   case NVPTXISD::Suld1DV2I32Clamp:
2888     Opc = NVPTX::SULD_1D_V2I32_CLAMP;
2889     break;
2890   case NVPTXISD::Suld1DV2I64Clamp:
2891     Opc = NVPTX::SULD_1D_V2I64_CLAMP;
2892     break;
2893   case NVPTXISD::Suld1DV4I8Clamp:
2894     Opc = NVPTX::SULD_1D_V4I8_CLAMP;
2895     break;
2896   case NVPTXISD::Suld1DV4I16Clamp:
2897     Opc = NVPTX::SULD_1D_V4I16_CLAMP;
2898     break;
2899   case NVPTXISD::Suld1DV4I32Clamp:
2900     Opc = NVPTX::SULD_1D_V4I32_CLAMP;
2901     break;
2902   case NVPTXISD::Suld1DArrayI8Clamp:
2903     Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP;
2904     break;
2905   case NVPTXISD::Suld1DArrayI16Clamp:
2906     Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP;
2907     break;
2908   case NVPTXISD::Suld1DArrayI32Clamp:
2909     Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP;
2910     break;
2911   case NVPTXISD::Suld1DArrayI64Clamp:
2912     Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP;
2913     break;
2914   case NVPTXISD::Suld1DArrayV2I8Clamp:
2915     Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP;
2916     break;
2917   case NVPTXISD::Suld1DArrayV2I16Clamp:
2918     Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP;
2919     break;
2920   case NVPTXISD::Suld1DArrayV2I32Clamp:
2921     Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP;
2922     break;
2923   case NVPTXISD::Suld1DArrayV2I64Clamp:
2924     Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP;
2925     break;
2926   case NVPTXISD::Suld1DArrayV4I8Clamp:
2927     Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP;
2928     break;
2929   case NVPTXISD::Suld1DArrayV4I16Clamp:
2930     Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP;
2931     break;
2932   case NVPTXISD::Suld1DArrayV4I32Clamp:
2933     Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP;
2934     break;
2935   case NVPTXISD::Suld2DI8Clamp:
2936     Opc = NVPTX::SULD_2D_I8_CLAMP;
2937     break;
2938   case NVPTXISD::Suld2DI16Clamp:
2939     Opc = NVPTX::SULD_2D_I16_CLAMP;
2940     break;
2941   case NVPTXISD::Suld2DI32Clamp:
2942     Opc = NVPTX::SULD_2D_I32_CLAMP;
2943     break;
2944   case NVPTXISD::Suld2DI64Clamp:
2945     Opc = NVPTX::SULD_2D_I64_CLAMP;
2946     break;
2947   case NVPTXISD::Suld2DV2I8Clamp:
2948     Opc = NVPTX::SULD_2D_V2I8_CLAMP;
2949     break;
2950   case NVPTXISD::Suld2DV2I16Clamp:
2951     Opc = NVPTX::SULD_2D_V2I16_CLAMP;
2952     break;
2953   case NVPTXISD::Suld2DV2I32Clamp:
2954     Opc = NVPTX::SULD_2D_V2I32_CLAMP;
2955     break;
2956   case NVPTXISD::Suld2DV2I64Clamp:
2957     Opc = NVPTX::SULD_2D_V2I64_CLAMP;
2958     break;
2959   case NVPTXISD::Suld2DV4I8Clamp:
2960     Opc = NVPTX::SULD_2D_V4I8_CLAMP;
2961     break;
2962   case NVPTXISD::Suld2DV4I16Clamp:
2963     Opc = NVPTX::SULD_2D_V4I16_CLAMP;
2964     break;
2965   case NVPTXISD::Suld2DV4I32Clamp:
2966     Opc = NVPTX::SULD_2D_V4I32_CLAMP;
2967     break;
2968   case NVPTXISD::Suld2DArrayI8Clamp:
2969     Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP;
2970     break;
2971   case NVPTXISD::Suld2DArrayI16Clamp:
2972     Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP;
2973     break;
2974   case NVPTXISD::Suld2DArrayI32Clamp:
2975     Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP;
2976     break;
2977   case NVPTXISD::Suld2DArrayI64Clamp:
2978     Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP;
2979     break;
2980   case NVPTXISD::Suld2DArrayV2I8Clamp:
2981     Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP;
2982     break;
2983   case NVPTXISD::Suld2DArrayV2I16Clamp:
2984     Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP;
2985     break;
2986   case NVPTXISD::Suld2DArrayV2I32Clamp:
2987     Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP;
2988     break;
2989   case NVPTXISD::Suld2DArrayV2I64Clamp:
2990     Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP;
2991     break;
2992   case NVPTXISD::Suld2DArrayV4I8Clamp:
2993     Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP;
2994     break;
2995   case NVPTXISD::Suld2DArrayV4I16Clamp:
2996     Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP;
2997     break;
2998   case NVPTXISD::Suld2DArrayV4I32Clamp:
2999     Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP;
3000     break;
3001   case NVPTXISD::Suld3DI8Clamp:
3002     Opc = NVPTX::SULD_3D_I8_CLAMP;
3003     break;
3004   case NVPTXISD::Suld3DI16Clamp:
3005     Opc = NVPTX::SULD_3D_I16_CLAMP;
3006     break;
3007   case NVPTXISD::Suld3DI32Clamp:
3008     Opc = NVPTX::SULD_3D_I32_CLAMP;
3009     break;
3010   case NVPTXISD::Suld3DI64Clamp:
3011     Opc = NVPTX::SULD_3D_I64_CLAMP;
3012     break;
3013   case NVPTXISD::Suld3DV2I8Clamp:
3014     Opc = NVPTX::SULD_3D_V2I8_CLAMP;
3015     break;
3016   case NVPTXISD::Suld3DV2I16Clamp:
3017     Opc = NVPTX::SULD_3D_V2I16_CLAMP;
3018     break;
3019   case NVPTXISD::Suld3DV2I32Clamp:
3020     Opc = NVPTX::SULD_3D_V2I32_CLAMP;
3021     break;
3022   case NVPTXISD::Suld3DV2I64Clamp:
3023     Opc = NVPTX::SULD_3D_V2I64_CLAMP;
3024     break;
3025   case NVPTXISD::Suld3DV4I8Clamp:
3026     Opc = NVPTX::SULD_3D_V4I8_CLAMP;
3027     break;
3028   case NVPTXISD::Suld3DV4I16Clamp:
3029     Opc = NVPTX::SULD_3D_V4I16_CLAMP;
3030     break;
3031   case NVPTXISD::Suld3DV4I32Clamp:
3032     Opc = NVPTX::SULD_3D_V4I32_CLAMP;
3033     break;
3034   case NVPTXISD::Suld1DI8Trap:
3035     Opc = NVPTX::SULD_1D_I8_TRAP;
3036     break;
3037   case NVPTXISD::Suld1DI16Trap:
3038     Opc = NVPTX::SULD_1D_I16_TRAP;
3039     break;
3040   case NVPTXISD::Suld1DI32Trap:
3041     Opc = NVPTX::SULD_1D_I32_TRAP;
3042     break;
3043   case NVPTXISD::Suld1DI64Trap:
3044     Opc = NVPTX::SULD_1D_I64_TRAP;
3045     break;
3046   case NVPTXISD::Suld1DV2I8Trap:
3047     Opc = NVPTX::SULD_1D_V2I8_TRAP;
3048     break;
3049   case NVPTXISD::Suld1DV2I16Trap:
3050     Opc = NVPTX::SULD_1D_V2I16_TRAP;
3051     break;
3052   case NVPTXISD::Suld1DV2I32Trap:
3053     Opc = NVPTX::SULD_1D_V2I32_TRAP;
3054     break;
3055   case NVPTXISD::Suld1DV2I64Trap:
3056     Opc = NVPTX::SULD_1D_V2I64_TRAP;
3057     break;
3058   case NVPTXISD::Suld1DV4I8Trap:
3059     Opc = NVPTX::SULD_1D_V4I8_TRAP;
3060     break;
3061   case NVPTXISD::Suld1DV4I16Trap:
3062     Opc = NVPTX::SULD_1D_V4I16_TRAP;
3063     break;
3064   case NVPTXISD::Suld1DV4I32Trap:
3065     Opc = NVPTX::SULD_1D_V4I32_TRAP;
3066     break;
3067   case NVPTXISD::Suld1DArrayI8Trap:
3068     Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP;
3069     break;
3070   case NVPTXISD::Suld1DArrayI16Trap:
3071     Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP;
3072     break;
3073   case NVPTXISD::Suld1DArrayI32Trap:
3074     Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP;
3075     break;
3076   case NVPTXISD::Suld1DArrayI64Trap:
3077     Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP;
3078     break;
3079   case NVPTXISD::Suld1DArrayV2I8Trap:
3080     Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP;
3081     break;
3082   case NVPTXISD::Suld1DArrayV2I16Trap:
3083     Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP;
3084     break;
3085   case NVPTXISD::Suld1DArrayV2I32Trap:
3086     Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP;
3087     break;
3088   case NVPTXISD::Suld1DArrayV2I64Trap:
3089     Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP;
3090     break;
3091   case NVPTXISD::Suld1DArrayV4I8Trap:
3092     Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP;
3093     break;
3094   case NVPTXISD::Suld1DArrayV4I16Trap:
3095     Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP;
3096     break;
3097   case NVPTXISD::Suld1DArrayV4I32Trap:
3098     Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP;
3099     break;
3100   case NVPTXISD::Suld2DI8Trap:
3101     Opc = NVPTX::SULD_2D_I8_TRAP;
3102     break;
3103   case NVPTXISD::Suld2DI16Trap:
3104     Opc = NVPTX::SULD_2D_I16_TRAP;
3105     break;
3106   case NVPTXISD::Suld2DI32Trap:
3107     Opc = NVPTX::SULD_2D_I32_TRAP;
3108     break;
3109   case NVPTXISD::Suld2DI64Trap:
3110     Opc = NVPTX::SULD_2D_I64_TRAP;
3111     break;
3112   case NVPTXISD::Suld2DV2I8Trap:
3113     Opc = NVPTX::SULD_2D_V2I8_TRAP;
3114     break;
3115   case NVPTXISD::Suld2DV2I16Trap:
3116     Opc = NVPTX::SULD_2D_V2I16_TRAP;
3117     break;
3118   case NVPTXISD::Suld2DV2I32Trap:
3119     Opc = NVPTX::SULD_2D_V2I32_TRAP;
3120     break;
3121   case NVPTXISD::Suld2DV2I64Trap:
3122     Opc = NVPTX::SULD_2D_V2I64_TRAP;
3123     break;
3124   case NVPTXISD::Suld2DV4I8Trap:
3125     Opc = NVPTX::SULD_2D_V4I8_TRAP;
3126     break;
3127   case NVPTXISD::Suld2DV4I16Trap:
3128     Opc = NVPTX::SULD_2D_V4I16_TRAP;
3129     break;
3130   case NVPTXISD::Suld2DV4I32Trap:
3131     Opc = NVPTX::SULD_2D_V4I32_TRAP;
3132     break;
3133   case NVPTXISD::Suld2DArrayI8Trap:
3134     Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP;
3135     break;
3136   case NVPTXISD::Suld2DArrayI16Trap:
3137     Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP;
3138     break;
3139   case NVPTXISD::Suld2DArrayI32Trap:
3140     Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP;
3141     break;
3142   case NVPTXISD::Suld2DArrayI64Trap:
3143     Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP;
3144     break;
3145   case NVPTXISD::Suld2DArrayV2I8Trap:
3146     Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP;
3147     break;
3148   case NVPTXISD::Suld2DArrayV2I16Trap:
3149     Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP;
3150     break;
3151   case NVPTXISD::Suld2DArrayV2I32Trap:
3152     Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP;
3153     break;
3154   case NVPTXISD::Suld2DArrayV2I64Trap:
3155     Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP;
3156     break;
3157   case NVPTXISD::Suld2DArrayV4I8Trap:
3158     Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP;
3159     break;
3160   case NVPTXISD::Suld2DArrayV4I16Trap:
3161     Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP;
3162     break;
3163   case NVPTXISD::Suld2DArrayV4I32Trap:
3164     Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP;
3165     break;
3166   case NVPTXISD::Suld3DI8Trap:
3167     Opc = NVPTX::SULD_3D_I8_TRAP;
3168     break;
3169   case NVPTXISD::Suld3DI16Trap:
3170     Opc = NVPTX::SULD_3D_I16_TRAP;
3171     break;
3172   case NVPTXISD::Suld3DI32Trap:
3173     Opc = NVPTX::SULD_3D_I32_TRAP;
3174     break;
3175   case NVPTXISD::Suld3DI64Trap:
3176     Opc = NVPTX::SULD_3D_I64_TRAP;
3177     break;
3178   case NVPTXISD::Suld3DV2I8Trap:
3179     Opc = NVPTX::SULD_3D_V2I8_TRAP;
3180     break;
3181   case NVPTXISD::Suld3DV2I16Trap:
3182     Opc = NVPTX::SULD_3D_V2I16_TRAP;
3183     break;
3184   case NVPTXISD::Suld3DV2I32Trap:
3185     Opc = NVPTX::SULD_3D_V2I32_TRAP;
3186     break;
3187   case NVPTXISD::Suld3DV2I64Trap:
3188     Opc = NVPTX::SULD_3D_V2I64_TRAP;
3189     break;
3190   case NVPTXISD::Suld3DV4I8Trap:
3191     Opc = NVPTX::SULD_3D_V4I8_TRAP;
3192     break;
3193   case NVPTXISD::Suld3DV4I16Trap:
3194     Opc = NVPTX::SULD_3D_V4I16_TRAP;
3195     break;
3196   case NVPTXISD::Suld3DV4I32Trap:
3197     Opc = NVPTX::SULD_3D_V4I32_TRAP;
3198     break;
3199   case NVPTXISD::Suld1DI8Zero:
3200     Opc = NVPTX::SULD_1D_I8_ZERO;
3201     break;
3202   case NVPTXISD::Suld1DI16Zero:
3203     Opc = NVPTX::SULD_1D_I16_ZERO;
3204     break;
3205   case NVPTXISD::Suld1DI32Zero:
3206     Opc = NVPTX::SULD_1D_I32_ZERO;
3207     break;
3208   case NVPTXISD::Suld1DI64Zero:
3209     Opc = NVPTX::SULD_1D_I64_ZERO;
3210     break;
3211   case NVPTXISD::Suld1DV2I8Zero:
3212     Opc = NVPTX::SULD_1D_V2I8_ZERO;
3213     break;
3214   case NVPTXISD::Suld1DV2I16Zero:
3215     Opc = NVPTX::SULD_1D_V2I16_ZERO;
3216     break;
3217   case NVPTXISD::Suld1DV2I32Zero:
3218     Opc = NVPTX::SULD_1D_V2I32_ZERO;
3219     break;
3220   case NVPTXISD::Suld1DV2I64Zero:
3221     Opc = NVPTX::SULD_1D_V2I64_ZERO;
3222     break;
3223   case NVPTXISD::Suld1DV4I8Zero:
3224     Opc = NVPTX::SULD_1D_V4I8_ZERO;
3225     break;
3226   case NVPTXISD::Suld1DV4I16Zero:
3227     Opc = NVPTX::SULD_1D_V4I16_ZERO;
3228     break;
3229   case NVPTXISD::Suld1DV4I32Zero:
3230     Opc = NVPTX::SULD_1D_V4I32_ZERO;
3231     break;
3232   case NVPTXISD::Suld1DArrayI8Zero:
3233     Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO;
3234     break;
3235   case NVPTXISD::Suld1DArrayI16Zero:
3236     Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO;
3237     break;
3238   case NVPTXISD::Suld1DArrayI32Zero:
3239     Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO;
3240     break;
3241   case NVPTXISD::Suld1DArrayI64Zero:
3242     Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO;
3243     break;
3244   case NVPTXISD::Suld1DArrayV2I8Zero:
3245     Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO;
3246     break;
3247   case NVPTXISD::Suld1DArrayV2I16Zero:
3248     Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO;
3249     break;
3250   case NVPTXISD::Suld1DArrayV2I32Zero:
3251     Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO;
3252     break;
3253   case NVPTXISD::Suld1DArrayV2I64Zero:
3254     Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO;
3255     break;
3256   case NVPTXISD::Suld1DArrayV4I8Zero:
3257     Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO;
3258     break;
3259   case NVPTXISD::Suld1DArrayV4I16Zero:
3260     Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO;
3261     break;
3262   case NVPTXISD::Suld1DArrayV4I32Zero:
3263     Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO;
3264     break;
3265   case NVPTXISD::Suld2DI8Zero:
3266     Opc = NVPTX::SULD_2D_I8_ZERO;
3267     break;
3268   case NVPTXISD::Suld2DI16Zero:
3269     Opc = NVPTX::SULD_2D_I16_ZERO;
3270     break;
3271   case NVPTXISD::Suld2DI32Zero:
3272     Opc = NVPTX::SULD_2D_I32_ZERO;
3273     break;
3274   case NVPTXISD::Suld2DI64Zero:
3275     Opc = NVPTX::SULD_2D_I64_ZERO;
3276     break;
3277   case NVPTXISD::Suld2DV2I8Zero:
3278     Opc = NVPTX::SULD_2D_V2I8_ZERO;
3279     break;
3280   case NVPTXISD::Suld2DV2I16Zero:
3281     Opc = NVPTX::SULD_2D_V2I16_ZERO;
3282     break;
3283   case NVPTXISD::Suld2DV2I32Zero:
3284     Opc = NVPTX::SULD_2D_V2I32_ZERO;
3285     break;
3286   case NVPTXISD::Suld2DV2I64Zero:
3287     Opc = NVPTX::SULD_2D_V2I64_ZERO;
3288     break;
3289   case NVPTXISD::Suld2DV4I8Zero:
3290     Opc = NVPTX::SULD_2D_V4I8_ZERO;
3291     break;
3292   case NVPTXISD::Suld2DV4I16Zero:
3293     Opc = NVPTX::SULD_2D_V4I16_ZERO;
3294     break;
3295   case NVPTXISD::Suld2DV4I32Zero:
3296     Opc = NVPTX::SULD_2D_V4I32_ZERO;
3297     break;
3298   case NVPTXISD::Suld2DArrayI8Zero:
3299     Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO;
3300     break;
3301   case NVPTXISD::Suld2DArrayI16Zero:
3302     Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO;
3303     break;
3304   case NVPTXISD::Suld2DArrayI32Zero:
3305     Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO;
3306     break;
3307   case NVPTXISD::Suld2DArrayI64Zero:
3308     Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO;
3309     break;
3310   case NVPTXISD::Suld2DArrayV2I8Zero:
3311     Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO;
3312     break;
3313   case NVPTXISD::Suld2DArrayV2I16Zero:
3314     Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO;
3315     break;
3316   case NVPTXISD::Suld2DArrayV2I32Zero:
3317     Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO;
3318     break;
3319   case NVPTXISD::Suld2DArrayV2I64Zero:
3320     Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO;
3321     break;
3322   case NVPTXISD::Suld2DArrayV4I8Zero:
3323     Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO;
3324     break;
3325   case NVPTXISD::Suld2DArrayV4I16Zero:
3326     Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO;
3327     break;
3328   case NVPTXISD::Suld2DArrayV4I32Zero:
3329     Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO;
3330     break;
3331   case NVPTXISD::Suld3DI8Zero:
3332     Opc = NVPTX::SULD_3D_I8_ZERO;
3333     break;
3334   case NVPTXISD::Suld3DI16Zero:
3335     Opc = NVPTX::SULD_3D_I16_ZERO;
3336     break;
3337   case NVPTXISD::Suld3DI32Zero:
3338     Opc = NVPTX::SULD_3D_I32_ZERO;
3339     break;
3340   case NVPTXISD::Suld3DI64Zero:
3341     Opc = NVPTX::SULD_3D_I64_ZERO;
3342     break;
3343   case NVPTXISD::Suld3DV2I8Zero:
3344     Opc = NVPTX::SULD_3D_V2I8_ZERO;
3345     break;
3346   case NVPTXISD::Suld3DV2I16Zero:
3347     Opc = NVPTX::SULD_3D_V2I16_ZERO;
3348     break;
3349   case NVPTXISD::Suld3DV2I32Zero:
3350     Opc = NVPTX::SULD_3D_V2I32_ZERO;
3351     break;
3352   case NVPTXISD::Suld3DV2I64Zero:
3353     Opc = NVPTX::SULD_3D_V2I64_ZERO;
3354     break;
3355   case NVPTXISD::Suld3DV4I8Zero:
3356     Opc = NVPTX::SULD_3D_V4I8_ZERO;
3357     break;
3358   case NVPTXISD::Suld3DV4I16Zero:
3359     Opc = NVPTX::SULD_3D_V4I16_ZERO;
3360     break;
3361   case NVPTXISD::Suld3DV4I32Zero:
3362     Opc = NVPTX::SULD_3D_V4I32_ZERO;
3363     break;
3364   }
3365 
3366   // Copy over operands
3367   SmallVector<SDValue, 8> Ops(N->op_begin() + 1, N->op_end());
3368   Ops.push_back(N->getOperand(0)); // Move chain to the back.
3369 
3370   ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3371   return true;
3372 }
3373 
3374 
3375 /// SelectBFE - Look for instruction sequences that can be made more efficient
3376 /// by using the 'bfe' (bit-field extract) PTX instruction
3377 bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
3378   SDLoc DL(N);
3379   SDValue LHS = N->getOperand(0);
3380   SDValue RHS = N->getOperand(1);
3381   SDValue Len;
3382   SDValue Start;
3383   SDValue Val;
3384   bool IsSigned = false;
3385 
3386   if (N->getOpcode() == ISD::AND) {
3387     // Canonicalize the operands
3388     // We want 'and %val, %mask'
3389     if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
3390       std::swap(LHS, RHS);
3391     }
3392 
3393     ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
3394     if (!Mask) {
3395       // We need a constant mask on the RHS of the AND
3396       return false;
3397     }
3398 
3399     // Extract the mask bits
3400     uint64_t MaskVal = Mask->getZExtValue();
3401     if (!isMask_64(MaskVal)) {
3402       // We *could* handle shifted masks here, but doing so would require an
3403       // 'and' operation to fix up the low-order bits so we would trade
3404       // shr+and for bfe+and, which has the same throughput
3405       return false;
3406     }
3407 
3408     // How many bits are in our mask?
3409     uint64_t NumBits = countTrailingOnes(MaskVal);
3410     Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3411 
3412     if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
3413       // We have a 'srl/and' pair, extract the effective start bit and length
3414       Val = LHS.getNode()->getOperand(0);
3415       Start = LHS.getNode()->getOperand(1);
3416       ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
3417       if (StartConst) {
3418         uint64_t StartVal = StartConst->getZExtValue();
3419         // How many "good" bits do we have left?  "good" is defined here as bits
3420         // that exist in the original value, not shifted in.
3421         uint64_t GoodBits = Start.getValueSizeInBits() - StartVal;
3422         if (NumBits > GoodBits) {
3423           // Do not handle the case where bits have been shifted in. In theory
3424           // we could handle this, but the cost is likely higher than just
3425           // emitting the srl/and pair.
3426           return false;
3427         }
3428         Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
3429       } else {
3430         // Do not handle the case where the shift amount (can be zero if no srl
3431         // was found) is not constant. We could handle this case, but it would
3432         // require run-time logic that would be more expensive than just
3433         // emitting the srl/and pair.
3434         return false;
3435       }
3436     } else {
3437       // Do not handle the case where the LHS of the and is not a shift. While
3438       // it would be trivial to handle this case, it would just transform
3439       // 'and' -> 'bfe', but 'and' has higher-throughput.
3440       return false;
3441     }
3442   } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
3443     if (LHS->getOpcode() == ISD::AND) {
3444       ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
3445       if (!ShiftCnst) {
3446         // Shift amount must be constant
3447         return false;
3448       }
3449 
3450       uint64_t ShiftAmt = ShiftCnst->getZExtValue();
3451 
3452       SDValue AndLHS = LHS->getOperand(0);
3453       SDValue AndRHS = LHS->getOperand(1);
3454 
3455       // Canonicalize the AND to have the mask on the RHS
3456       if (isa<ConstantSDNode>(AndLHS)) {
3457         std::swap(AndLHS, AndRHS);
3458       }
3459 
3460       ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
3461       if (!MaskCnst) {
3462         // Mask must be constant
3463         return false;
3464       }
3465 
3466       uint64_t MaskVal = MaskCnst->getZExtValue();
3467       uint64_t NumZeros;
3468       uint64_t NumBits;
3469       if (isMask_64(MaskVal)) {
3470         NumZeros = 0;
3471         // The number of bits in the result bitfield will be the number of
3472         // trailing ones (the AND) minus the number of bits we shift off
3473         NumBits = countTrailingOnes(MaskVal) - ShiftAmt;
3474       } else if (isShiftedMask_64(MaskVal)) {
3475         NumZeros = countTrailingZeros(MaskVal);
3476         unsigned NumOnes = countTrailingOnes(MaskVal >> NumZeros);
3477         // The number of bits in the result bitfield will be the number of
3478         // trailing zeros plus the number of set bits in the mask minus the
3479         // number of bits we shift off
3480         NumBits = NumZeros + NumOnes - ShiftAmt;
3481       } else {
3482         // This is not a mask we can handle
3483         return false;
3484       }
3485 
3486       if (ShiftAmt < NumZeros) {
3487         // Handling this case would require extra logic that would make this
3488         // transformation non-profitable
3489         return false;
3490       }
3491 
3492       Val = AndLHS;
3493       Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
3494       Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3495     } else if (LHS->getOpcode() == ISD::SHL) {
3496       // Here, we have a pattern like:
3497       //
3498       // (sra (shl val, NN), MM)
3499       // or
3500       // (srl (shl val, NN), MM)
3501       //
3502       // If MM >= NN, we can efficiently optimize this with bfe
3503       Val = LHS->getOperand(0);
3504 
3505       SDValue ShlRHS = LHS->getOperand(1);
3506       ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
3507       if (!ShlCnst) {
3508         // Shift amount must be constant
3509         return false;
3510       }
3511       uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
3512 
3513       SDValue ShrRHS = RHS;
3514       ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
3515       if (!ShrCnst) {
3516         // Shift amount must be constant
3517         return false;
3518       }
3519       uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
3520 
3521       // To avoid extra codegen and be profitable, we need Outer >= Inner
3522       if (OuterShiftAmt < InnerShiftAmt) {
3523         return false;
3524       }
3525 
3526       // If the outer shift is more than the type size, we have no bitfield to
3527       // extract (since we also check that the inner shift is <= the outer shift
3528       // then this also implies that the inner shift is < the type size)
3529       if (OuterShiftAmt >= Val.getValueSizeInBits()) {
3530         return false;
3531       }
3532 
3533       Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
3534                                         MVT::i32);
3535       Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
3536                                       DL, MVT::i32);
3537 
3538       if (N->getOpcode() == ISD::SRA) {
3539         // If we have a arithmetic right shift, we need to use the signed bfe
3540         // variant
3541         IsSigned = true;
3542       }
3543     } else {
3544       // No can do...
3545       return false;
3546     }
3547   } else {
3548     // No can do...
3549     return false;
3550   }
3551 
3552 
3553   unsigned Opc;
3554   // For the BFE operations we form here from "and" and "srl", always use the
3555   // unsigned variants.
3556   if (Val.getValueType() == MVT::i32) {
3557     if (IsSigned) {
3558       Opc = NVPTX::BFE_S32rii;
3559     } else {
3560       Opc = NVPTX::BFE_U32rii;
3561     }
3562   } else if (Val.getValueType() == MVT::i64) {
3563     if (IsSigned) {
3564       Opc = NVPTX::BFE_S64rii;
3565     } else {
3566       Opc = NVPTX::BFE_U64rii;
3567     }
3568   } else {
3569     // We cannot handle this type
3570     return false;
3571   }
3572 
3573   SDValue Ops[] = {
3574     Val, Start, Len
3575   };
3576 
3577   ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
3578   return true;
3579 }
3580 
3581 // SelectDirectAddr - Match a direct address for DAG.
3582 // A direct address could be a globaladdress or externalsymbol.
3583 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
3584   // Return true if TGA or ES.
3585   if (N.getOpcode() == ISD::TargetGlobalAddress ||
3586       N.getOpcode() == ISD::TargetExternalSymbol) {
3587     Address = N;
3588     return true;
3589   }
3590   if (N.getOpcode() == NVPTXISD::Wrapper) {
3591     Address = N.getOperand(0);
3592     return true;
3593   }
3594   // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3595   if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
3596     if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
3597         CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
3598         CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
3599       return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
3600   }
3601   return false;
3602 }
3603 
3604 // symbol+offset
3605 bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3606     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3607   if (Addr.getOpcode() == ISD::ADD) {
3608     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3609       SDValue base = Addr.getOperand(0);
3610       if (SelectDirectAddr(base, Base)) {
3611         Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3612                                            mvt);
3613         return true;
3614       }
3615     }
3616   }
3617   return false;
3618 }
3619 
3620 // symbol+offset
3621 bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
3622                                      SDValue &Base, SDValue &Offset) {
3623   return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
3624 }
3625 
3626 // symbol+offset
3627 bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
3628                                        SDValue &Base, SDValue &Offset) {
3629   return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
3630 }
3631 
3632 // register+offset
3633 bool NVPTXDAGToDAGISel::SelectADDRri_imp(
3634     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3635   if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
3636     Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3637     Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
3638     return true;
3639   }
3640   if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
3641       Addr.getOpcode() == ISD::TargetGlobalAddress)
3642     return false; // direct calls.
3643 
3644   if (Addr.getOpcode() == ISD::ADD) {
3645     if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
3646       return false;
3647     }
3648     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3649       if (FrameIndexSDNode *FIN =
3650               dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
3651         // Constant offset from frame ref.
3652         Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3653       else
3654         Base = Addr.getOperand(0);
3655       Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3656                                          mvt);
3657       return true;
3658     }
3659   }
3660   return false;
3661 }
3662 
3663 // register+offset
3664 bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
3665                                      SDValue &Base, SDValue &Offset) {
3666   return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
3667 }
3668 
3669 // register+offset
3670 bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
3671                                        SDValue &Base, SDValue &Offset) {
3672   return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
3673 }
3674 
3675 bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
3676                                                  unsigned int spN) const {
3677   const Value *Src = nullptr;
3678   if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
3679     if (spN == 0 && mN->getMemOperand()->getPseudoValue())
3680       return true;
3681     Src = mN->getMemOperand()->getValue();
3682   }
3683   if (!Src)
3684     return false;
3685   if (auto *PT = dyn_cast<PointerType>(Src->getType()))
3686     return (PT->getAddressSpace() == spN);
3687   return false;
3688 }
3689 
3690 /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3691 /// inline asm expressions.
3692 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
3693     const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
3694   SDValue Op0, Op1;
3695   switch (ConstraintID) {
3696   default:
3697     return true;
3698   case InlineAsm::Constraint_m: // memory
3699     if (SelectDirectAddr(Op, Op0)) {
3700       OutOps.push_back(Op0);
3701       OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
3702       return false;
3703     }
3704     if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
3705       OutOps.push_back(Op0);
3706       OutOps.push_back(Op1);
3707       return false;
3708     }
3709     break;
3710   }
3711   return true;
3712 }
3713 
3714 /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
3715 /// conversion from \p SrcTy to \p DestTy.
3716 unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
3717                                              bool IsSigned) {
3718   switch (SrcTy.SimpleTy) {
3719   default:
3720     llvm_unreachable("Unhandled source type");
3721   case MVT::i8:
3722     switch (DestTy.SimpleTy) {
3723     default:
3724       llvm_unreachable("Unhandled dest type");
3725     case MVT::i16:
3726       return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
3727     case MVT::i32:
3728       return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
3729     case MVT::i64:
3730       return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
3731     }
3732   case MVT::i16:
3733     switch (DestTy.SimpleTy) {
3734     default:
3735       llvm_unreachable("Unhandled dest type");
3736     case MVT::i8:
3737       return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
3738     case MVT::i32:
3739       return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
3740     case MVT::i64:
3741       return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
3742     }
3743   case MVT::i32:
3744     switch (DestTy.SimpleTy) {
3745     default:
3746       llvm_unreachable("Unhandled dest type");
3747     case MVT::i8:
3748       return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
3749     case MVT::i16:
3750       return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
3751     case MVT::i64:
3752       return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
3753     }
3754   case MVT::i64:
3755     switch (DestTy.SimpleTy) {
3756     default:
3757       llvm_unreachable("Unhandled dest type");
3758     case MVT::i8:
3759       return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
3760     case MVT::i16:
3761       return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
3762     case MVT::i32:
3763       return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
3764     }
3765   }
3766 }
3767