xref: /freebsd/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp (revision 3e8eb5c7f4909209c042403ddee340b2ee7003a5)
1 //===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines an instruction selector for the NVPTX target.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "NVPTXISelDAGToDAG.h"
14 #include "MCTargetDesc/NVPTXBaseInfo.h"
15 #include "NVPTXUtilities.h"
16 #include "llvm/Analysis/ValueTracking.h"
17 #include "llvm/IR/GlobalValue.h"
18 #include "llvm/IR/Instructions.h"
19 #include "llvm/IR/IntrinsicsNVPTX.h"
20 #include "llvm/Support/AtomicOrdering.h"
21 #include "llvm/Support/CommandLine.h"
22 #include "llvm/Support/Debug.h"
23 #include "llvm/Support/ErrorHandling.h"
24 #include "llvm/Support/raw_ostream.h"
25 #include "llvm/Target/TargetIntrinsicInfo.h"
26 
27 using namespace llvm;
28 
29 #define DEBUG_TYPE "nvptx-isel"
30 
31 /// createNVPTXISelDag - This pass converts a legalized DAG into a
32 /// NVPTX-specific DAG, ready for instruction scheduling.
33 FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
34                                        llvm::CodeGenOpt::Level OptLevel) {
35   return new NVPTXDAGToDAGISel(TM, OptLevel);
36 }
37 
38 NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
39                                      CodeGenOpt::Level OptLevel)
40     : SelectionDAGISel(tm, OptLevel), TM(tm) {
41   doMulWide = (OptLevel > 0);
42 }
43 
44 bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
45   Subtarget = &static_cast<const NVPTXSubtarget &>(MF.getSubtarget());
46   return SelectionDAGISel::runOnMachineFunction(MF);
47 }
48 
49 int NVPTXDAGToDAGISel::getDivF32Level() const {
50   return Subtarget->getTargetLowering()->getDivF32Level();
51 }
52 
53 bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
54   return Subtarget->getTargetLowering()->usePrecSqrtF32();
55 }
56 
57 bool NVPTXDAGToDAGISel::useF32FTZ() const {
58   return Subtarget->getTargetLowering()->useF32FTZ(*MF);
59 }
60 
61 bool NVPTXDAGToDAGISel::allowFMA() const {
62   const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
63   return TL->allowFMA(*MF, OptLevel);
64 }
65 
66 bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
67   const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
68   return TL->allowUnsafeFPMath(*MF);
69 }
70 
// Whether the target machine is in "short pointer" mode; this selects the
// *_6432/*_3264 cvta opcode variants in SelectAddrSpaceCast below.
bool NVPTXDAGToDAGISel::useShortPointers() const {
  return TM.useShortPointers();
}
74 
/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.  Nodes needing custom
/// handling are dispatched to a matching try*/Select* helper; anything
/// unhandled (or declined by its helper) falls through to the
/// auto-generated matcher via SelectCode().
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  switch (N->getOpcode()) {
  // Scalar and atomic loads/stores share one custom-selection path each.
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
  case ISD::ATOMIC_STORE:
    if (tryStore(N))
      return;
    break;
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  case NVPTXISD::SETP_F16X2:
    SelectSETP_F16X2(N);
    return;

  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDGV2:
  case NVPTXISD::LDGV4:
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDGLDU(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
    if (tryStoreVector(N))
      return;
    break;
  // Parameter/return-value traffic for the NVPTX calling convention.
  case NVPTXISD::LoadParam:
  case NVPTXISD::LoadParamV2:
  case NVPTXISD::LoadParamV4:
    if (tryLoadParam(N))
      return;
    break;
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    if (tryStoreRetval(N))
      return;
    break;
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParamU32:
    if (tryStoreParam(N))
      return;
    break;
  case ISD::INTRINSIC_WO_CHAIN:
    if (tryIntrinsicNoChain(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  // Texture fetches (tex.*/tld4.*), both independent- and unified-mode:
  // every flavor funnels into tryTextureIntrinsic.
  case NVPTXISD::Tex1DFloatS32:
  case NVPTXISD::Tex1DFloatFloat:
  case NVPTXISD::Tex1DFloatFloatLevel:
  case NVPTXISD::Tex1DFloatFloatGrad:
  case NVPTXISD::Tex1DS32S32:
  case NVPTXISD::Tex1DS32Float:
  case NVPTXISD::Tex1DS32FloatLevel:
  case NVPTXISD::Tex1DS32FloatGrad:
  case NVPTXISD::Tex1DU32S32:
  case NVPTXISD::Tex1DU32Float:
  case NVPTXISD::Tex1DU32FloatLevel:
  case NVPTXISD::Tex1DU32FloatGrad:
  case NVPTXISD::Tex1DArrayFloatS32:
  case NVPTXISD::Tex1DArrayFloatFloat:
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
  case NVPTXISD::Tex1DArrayS32S32:
  case NVPTXISD::Tex1DArrayS32Float:
  case NVPTXISD::Tex1DArrayS32FloatLevel:
  case NVPTXISD::Tex1DArrayS32FloatGrad:
  case NVPTXISD::Tex1DArrayU32S32:
  case NVPTXISD::Tex1DArrayU32Float:
  case NVPTXISD::Tex1DArrayU32FloatLevel:
  case NVPTXISD::Tex1DArrayU32FloatGrad:
  case NVPTXISD::Tex2DFloatS32:
  case NVPTXISD::Tex2DFloatFloat:
  case NVPTXISD::Tex2DFloatFloatLevel:
  case NVPTXISD::Tex2DFloatFloatGrad:
  case NVPTXISD::Tex2DS32S32:
  case NVPTXISD::Tex2DS32Float:
  case NVPTXISD::Tex2DS32FloatLevel:
  case NVPTXISD::Tex2DS32FloatGrad:
  case NVPTXISD::Tex2DU32S32:
  case NVPTXISD::Tex2DU32Float:
  case NVPTXISD::Tex2DU32FloatLevel:
  case NVPTXISD::Tex2DU32FloatGrad:
  case NVPTXISD::Tex2DArrayFloatS32:
  case NVPTXISD::Tex2DArrayFloatFloat:
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
  case NVPTXISD::Tex2DArrayS32S32:
  case NVPTXISD::Tex2DArrayS32Float:
  case NVPTXISD::Tex2DArrayS32FloatLevel:
  case NVPTXISD::Tex2DArrayS32FloatGrad:
  case NVPTXISD::Tex2DArrayU32S32:
  case NVPTXISD::Tex2DArrayU32Float:
  case NVPTXISD::Tex2DArrayU32FloatLevel:
  case NVPTXISD::Tex2DArrayU32FloatGrad:
  case NVPTXISD::Tex3DFloatS32:
  case NVPTXISD::Tex3DFloatFloat:
  case NVPTXISD::Tex3DFloatFloatLevel:
  case NVPTXISD::Tex3DFloatFloatGrad:
  case NVPTXISD::Tex3DS32S32:
  case NVPTXISD::Tex3DS32Float:
  case NVPTXISD::Tex3DS32FloatLevel:
  case NVPTXISD::Tex3DS32FloatGrad:
  case NVPTXISD::Tex3DU32S32:
  case NVPTXISD::Tex3DU32Float:
  case NVPTXISD::Tex3DU32FloatLevel:
  case NVPTXISD::Tex3DU32FloatGrad:
  case NVPTXISD::TexCubeFloatFloat:
  case NVPTXISD::TexCubeFloatFloatLevel:
  case NVPTXISD::TexCubeS32Float:
  case NVPTXISD::TexCubeS32FloatLevel:
  case NVPTXISD::TexCubeU32Float:
  case NVPTXISD::TexCubeU32FloatLevel:
  case NVPTXISD::TexCubeArrayFloatFloat:
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
  case NVPTXISD::TexCubeArrayS32Float:
  case NVPTXISD::TexCubeArrayS32FloatLevel:
  case NVPTXISD::TexCubeArrayU32Float:
  case NVPTXISD::TexCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4R2DFloatFloat:
  case NVPTXISD::Tld4G2DFloatFloat:
  case NVPTXISD::Tld4B2DFloatFloat:
  case NVPTXISD::Tld4A2DFloatFloat:
  case NVPTXISD::Tld4R2DS64Float:
  case NVPTXISD::Tld4G2DS64Float:
  case NVPTXISD::Tld4B2DS64Float:
  case NVPTXISD::Tld4A2DS64Float:
  case NVPTXISD::Tld4R2DU64Float:
  case NVPTXISD::Tld4G2DU64Float:
  case NVPTXISD::Tld4B2DU64Float:
  case NVPTXISD::Tld4A2DU64Float:
  case NVPTXISD::TexUnified1DFloatS32:
  case NVPTXISD::TexUnified1DFloatFloat:
  case NVPTXISD::TexUnified1DFloatFloatLevel:
  case NVPTXISD::TexUnified1DFloatFloatGrad:
  case NVPTXISD::TexUnified1DS32S32:
  case NVPTXISD::TexUnified1DS32Float:
  case NVPTXISD::TexUnified1DS32FloatLevel:
  case NVPTXISD::TexUnified1DS32FloatGrad:
  case NVPTXISD::TexUnified1DU32S32:
  case NVPTXISD::TexUnified1DU32Float:
  case NVPTXISD::TexUnified1DU32FloatLevel:
  case NVPTXISD::TexUnified1DU32FloatGrad:
  case NVPTXISD::TexUnified1DArrayFloatS32:
  case NVPTXISD::TexUnified1DArrayFloatFloat:
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified1DArrayS32S32:
  case NVPTXISD::TexUnified1DArrayS32Float:
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
  case NVPTXISD::TexUnified1DArrayU32S32:
  case NVPTXISD::TexUnified1DArrayU32Float:
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
  case NVPTXISD::TexUnified2DFloatS32:
  case NVPTXISD::TexUnified2DFloatFloat:
  case NVPTXISD::TexUnified2DFloatFloatLevel:
  case NVPTXISD::TexUnified2DFloatFloatGrad:
  case NVPTXISD::TexUnified2DS32S32:
  case NVPTXISD::TexUnified2DS32Float:
  case NVPTXISD::TexUnified2DS32FloatLevel:
  case NVPTXISD::TexUnified2DS32FloatGrad:
  case NVPTXISD::TexUnified2DU32S32:
  case NVPTXISD::TexUnified2DU32Float:
  case NVPTXISD::TexUnified2DU32FloatLevel:
  case NVPTXISD::TexUnified2DU32FloatGrad:
  case NVPTXISD::TexUnified2DArrayFloatS32:
  case NVPTXISD::TexUnified2DArrayFloatFloat:
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified2DArrayS32S32:
  case NVPTXISD::TexUnified2DArrayS32Float:
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
  case NVPTXISD::TexUnified2DArrayU32S32:
  case NVPTXISD::TexUnified2DArrayU32Float:
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
  case NVPTXISD::TexUnified3DFloatS32:
  case NVPTXISD::TexUnified3DFloatFloat:
  case NVPTXISD::TexUnified3DFloatFloatLevel:
  case NVPTXISD::TexUnified3DFloatFloatGrad:
  case NVPTXISD::TexUnified3DS32S32:
  case NVPTXISD::TexUnified3DS32Float:
  case NVPTXISD::TexUnified3DS32FloatLevel:
  case NVPTXISD::TexUnified3DS32FloatGrad:
  case NVPTXISD::TexUnified3DU32S32:
  case NVPTXISD::TexUnified3DU32Float:
  case NVPTXISD::TexUnified3DU32FloatLevel:
  case NVPTXISD::TexUnified3DU32FloatGrad:
  case NVPTXISD::TexUnifiedCubeFloatFloat:
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeS32Float:
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeU32Float:
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
  case NVPTXISD::Tld4UnifiedR2DS64Float:
  case NVPTXISD::Tld4UnifiedG2DS64Float:
  case NVPTXISD::Tld4UnifiedB2DS64Float:
  case NVPTXISD::Tld4UnifiedA2DS64Float:
  case NVPTXISD::Tld4UnifiedR2DU64Float:
  case NVPTXISD::Tld4UnifiedG2DU64Float:
  case NVPTXISD::Tld4UnifiedB2DU64Float:
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    if (tryTextureIntrinsic(N))
      return;
    break;
  // Surface loads (suld.*), one node per geometry/element-type combination
  // and out-of-bounds mode (Clamp/Trap/Zero): all go to trySurfaceIntrinsic.
  case NVPTXISD::Suld1DI8Clamp:
  case NVPTXISD::Suld1DI16Clamp:
  case NVPTXISD::Suld1DI32Clamp:
  case NVPTXISD::Suld1DI64Clamp:
  case NVPTXISD::Suld1DV2I8Clamp:
  case NVPTXISD::Suld1DV2I16Clamp:
  case NVPTXISD::Suld1DV2I32Clamp:
  case NVPTXISD::Suld1DV2I64Clamp:
  case NVPTXISD::Suld1DV4I8Clamp:
  case NVPTXISD::Suld1DV4I16Clamp:
  case NVPTXISD::Suld1DV4I32Clamp:
  case NVPTXISD::Suld1DArrayI8Clamp:
  case NVPTXISD::Suld1DArrayI16Clamp:
  case NVPTXISD::Suld1DArrayI32Clamp:
  case NVPTXISD::Suld1DArrayI64Clamp:
  case NVPTXISD::Suld1DArrayV2I8Clamp:
  case NVPTXISD::Suld1DArrayV2I16Clamp:
  case NVPTXISD::Suld1DArrayV2I32Clamp:
  case NVPTXISD::Suld1DArrayV2I64Clamp:
  case NVPTXISD::Suld1DArrayV4I8Clamp:
  case NVPTXISD::Suld1DArrayV4I16Clamp:
  case NVPTXISD::Suld1DArrayV4I32Clamp:
  case NVPTXISD::Suld2DI8Clamp:
  case NVPTXISD::Suld2DI16Clamp:
  case NVPTXISD::Suld2DI32Clamp:
  case NVPTXISD::Suld2DI64Clamp:
  case NVPTXISD::Suld2DV2I8Clamp:
  case NVPTXISD::Suld2DV2I16Clamp:
  case NVPTXISD::Suld2DV2I32Clamp:
  case NVPTXISD::Suld2DV2I64Clamp:
  case NVPTXISD::Suld2DV4I8Clamp:
  case NVPTXISD::Suld2DV4I16Clamp:
  case NVPTXISD::Suld2DV4I32Clamp:
  case NVPTXISD::Suld2DArrayI8Clamp:
  case NVPTXISD::Suld2DArrayI16Clamp:
  case NVPTXISD::Suld2DArrayI32Clamp:
  case NVPTXISD::Suld2DArrayI64Clamp:
  case NVPTXISD::Suld2DArrayV2I8Clamp:
  case NVPTXISD::Suld2DArrayV2I16Clamp:
  case NVPTXISD::Suld2DArrayV2I32Clamp:
  case NVPTXISD::Suld2DArrayV2I64Clamp:
  case NVPTXISD::Suld2DArrayV4I8Clamp:
  case NVPTXISD::Suld2DArrayV4I16Clamp:
  case NVPTXISD::Suld2DArrayV4I32Clamp:
  case NVPTXISD::Suld3DI8Clamp:
  case NVPTXISD::Suld3DI16Clamp:
  case NVPTXISD::Suld3DI32Clamp:
  case NVPTXISD::Suld3DI64Clamp:
  case NVPTXISD::Suld3DV2I8Clamp:
  case NVPTXISD::Suld3DV2I16Clamp:
  case NVPTXISD::Suld3DV2I32Clamp:
  case NVPTXISD::Suld3DV2I64Clamp:
  case NVPTXISD::Suld3DV4I8Clamp:
  case NVPTXISD::Suld3DV4I16Clamp:
  case NVPTXISD::Suld3DV4I32Clamp:
  case NVPTXISD::Suld1DI8Trap:
  case NVPTXISD::Suld1DI16Trap:
  case NVPTXISD::Suld1DI32Trap:
  case NVPTXISD::Suld1DI64Trap:
  case NVPTXISD::Suld1DV2I8Trap:
  case NVPTXISD::Suld1DV2I16Trap:
  case NVPTXISD::Suld1DV2I32Trap:
  case NVPTXISD::Suld1DV2I64Trap:
  case NVPTXISD::Suld1DV4I8Trap:
  case NVPTXISD::Suld1DV4I16Trap:
  case NVPTXISD::Suld1DV4I32Trap:
  case NVPTXISD::Suld1DArrayI8Trap:
  case NVPTXISD::Suld1DArrayI16Trap:
  case NVPTXISD::Suld1DArrayI32Trap:
  case NVPTXISD::Suld1DArrayI64Trap:
  case NVPTXISD::Suld1DArrayV2I8Trap:
  case NVPTXISD::Suld1DArrayV2I16Trap:
  case NVPTXISD::Suld1DArrayV2I32Trap:
  case NVPTXISD::Suld1DArrayV2I64Trap:
  case NVPTXISD::Suld1DArrayV4I8Trap:
  case NVPTXISD::Suld1DArrayV4I16Trap:
  case NVPTXISD::Suld1DArrayV4I32Trap:
  case NVPTXISD::Suld2DI8Trap:
  case NVPTXISD::Suld2DI16Trap:
  case NVPTXISD::Suld2DI32Trap:
  case NVPTXISD::Suld2DI64Trap:
  case NVPTXISD::Suld2DV2I8Trap:
  case NVPTXISD::Suld2DV2I16Trap:
  case NVPTXISD::Suld2DV2I32Trap:
  case NVPTXISD::Suld2DV2I64Trap:
  case NVPTXISD::Suld2DV4I8Trap:
  case NVPTXISD::Suld2DV4I16Trap:
  case NVPTXISD::Suld2DV4I32Trap:
  case NVPTXISD::Suld2DArrayI8Trap:
  case NVPTXISD::Suld2DArrayI16Trap:
  case NVPTXISD::Suld2DArrayI32Trap:
  case NVPTXISD::Suld2DArrayI64Trap:
  case NVPTXISD::Suld2DArrayV2I8Trap:
  case NVPTXISD::Suld2DArrayV2I16Trap:
  case NVPTXISD::Suld2DArrayV2I32Trap:
  case NVPTXISD::Suld2DArrayV2I64Trap:
  case NVPTXISD::Suld2DArrayV4I8Trap:
  case NVPTXISD::Suld2DArrayV4I16Trap:
  case NVPTXISD::Suld2DArrayV4I32Trap:
  case NVPTXISD::Suld3DI8Trap:
  case NVPTXISD::Suld3DI16Trap:
  case NVPTXISD::Suld3DI32Trap:
  case NVPTXISD::Suld3DI64Trap:
  case NVPTXISD::Suld3DV2I8Trap:
  case NVPTXISD::Suld3DV2I16Trap:
  case NVPTXISD::Suld3DV2I32Trap:
  case NVPTXISD::Suld3DV2I64Trap:
  case NVPTXISD::Suld3DV4I8Trap:
  case NVPTXISD::Suld3DV4I16Trap:
  case NVPTXISD::Suld3DV4I32Trap:
  case NVPTXISD::Suld1DI8Zero:
  case NVPTXISD::Suld1DI16Zero:
  case NVPTXISD::Suld1DI32Zero:
  case NVPTXISD::Suld1DI64Zero:
  case NVPTXISD::Suld1DV2I8Zero:
  case NVPTXISD::Suld1DV2I16Zero:
  case NVPTXISD::Suld1DV2I32Zero:
  case NVPTXISD::Suld1DV2I64Zero:
  case NVPTXISD::Suld1DV4I8Zero:
  case NVPTXISD::Suld1DV4I16Zero:
  case NVPTXISD::Suld1DV4I32Zero:
  case NVPTXISD::Suld1DArrayI8Zero:
  case NVPTXISD::Suld1DArrayI16Zero:
  case NVPTXISD::Suld1DArrayI32Zero:
  case NVPTXISD::Suld1DArrayI64Zero:
  case NVPTXISD::Suld1DArrayV2I8Zero:
  case NVPTXISD::Suld1DArrayV2I16Zero:
  case NVPTXISD::Suld1DArrayV2I32Zero:
  case NVPTXISD::Suld1DArrayV2I64Zero:
  case NVPTXISD::Suld1DArrayV4I8Zero:
  case NVPTXISD::Suld1DArrayV4I16Zero:
  case NVPTXISD::Suld1DArrayV4I32Zero:
  case NVPTXISD::Suld2DI8Zero:
  case NVPTXISD::Suld2DI16Zero:
  case NVPTXISD::Suld2DI32Zero:
  case NVPTXISD::Suld2DI64Zero:
  case NVPTXISD::Suld2DV2I8Zero:
  case NVPTXISD::Suld2DV2I16Zero:
  case NVPTXISD::Suld2DV2I32Zero:
  case NVPTXISD::Suld2DV2I64Zero:
  case NVPTXISD::Suld2DV4I8Zero:
  case NVPTXISD::Suld2DV4I16Zero:
  case NVPTXISD::Suld2DV4I32Zero:
  case NVPTXISD::Suld2DArrayI8Zero:
  case NVPTXISD::Suld2DArrayI16Zero:
  case NVPTXISD::Suld2DArrayI32Zero:
  case NVPTXISD::Suld2DArrayI64Zero:
  case NVPTXISD::Suld2DArrayV2I8Zero:
  case NVPTXISD::Suld2DArrayV2I16Zero:
  case NVPTXISD::Suld2DArrayV2I32Zero:
  case NVPTXISD::Suld2DArrayV2I64Zero:
  case NVPTXISD::Suld2DArrayV4I8Zero:
  case NVPTXISD::Suld2DArrayV4I16Zero:
  case NVPTXISD::Suld2DArrayV4I32Zero:
  case NVPTXISD::Suld3DI8Zero:
  case NVPTXISD::Suld3DI16Zero:
  case NVPTXISD::Suld3DI32Zero:
  case NVPTXISD::Suld3DI64Zero:
  case NVPTXISD::Suld3DV2I8Zero:
  case NVPTXISD::Suld3DV2I16Zero:
  case NVPTXISD::Suld3DV2I32Zero:
  case NVPTXISD::Suld3DV2I64Zero:
  case NVPTXISD::Suld3DV4I8Zero:
  case NVPTXISD::Suld3DV4I16Zero:
  case NVPTXISD::Suld3DV4I32Zero:
    if (trySurfaceIntrinsic(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
    if (tryBFE(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    return;
  case ISD::ConstantFP:
    if (tryConstantFP16(N))
      return;
    break;
  default:
    break;
  }
  // Fall back to the TableGen-generated matcher for everything else.
  SelectCode(N);
}
506 
507 bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
508   unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
509   switch (IID) {
510   default:
511     return false;
512   case Intrinsic::nvvm_ldg_global_f:
513   case Intrinsic::nvvm_ldg_global_i:
514   case Intrinsic::nvvm_ldg_global_p:
515   case Intrinsic::nvvm_ldu_global_f:
516   case Intrinsic::nvvm_ldu_global_i:
517   case Intrinsic::nvvm_ldu_global_p:
518     return tryLDGLDU(N);
519   }
520 }
521 
522 // There's no way to specify FP16 immediates in .f16 ops, so we have to
523 // load them into an .f16 register first.
524 bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode *N) {
525   if (N->getValueType(0) != MVT::f16)
526     return false;
527   SDValue Val = CurDAG->getTargetConstantFP(
528       cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), MVT::f16);
529   SDNode *LoadConstF16 =
530       CurDAG->getMachineNode(NVPTX::LOAD_CONST_F16, SDLoc(N), MVT::f16, Val);
531   ReplaceNode(N, LoadConstF16);
532   return true;
533 }
534 
535 // Map ISD:CONDCODE value to appropriate CmpMode expected by
536 // NVPTXInstPrinter::printCmpMode()
537 static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
538   using NVPTX::PTXCmpMode::CmpMode;
539   unsigned PTXCmpMode = [](ISD::CondCode CC) {
540     switch (CC) {
541     default:
542       llvm_unreachable("Unexpected condition code.");
543     case ISD::SETOEQ:
544       return CmpMode::EQ;
545     case ISD::SETOGT:
546       return CmpMode::GT;
547     case ISD::SETOGE:
548       return CmpMode::GE;
549     case ISD::SETOLT:
550       return CmpMode::LT;
551     case ISD::SETOLE:
552       return CmpMode::LE;
553     case ISD::SETONE:
554       return CmpMode::NE;
555     case ISD::SETO:
556       return CmpMode::NUM;
557     case ISD::SETUO:
558       return CmpMode::NotANumber;
559     case ISD::SETUEQ:
560       return CmpMode::EQU;
561     case ISD::SETUGT:
562       return CmpMode::GTU;
563     case ISD::SETUGE:
564       return CmpMode::GEU;
565     case ISD::SETULT:
566       return CmpMode::LTU;
567     case ISD::SETULE:
568       return CmpMode::LEU;
569     case ISD::SETUNE:
570       return CmpMode::NEU;
571     case ISD::SETEQ:
572       return CmpMode::EQ;
573     case ISD::SETGT:
574       return CmpMode::GT;
575     case ISD::SETGE:
576       return CmpMode::GE;
577     case ISD::SETLT:
578       return CmpMode::LT;
579     case ISD::SETLE:
580       return CmpMode::LE;
581     case ISD::SETNE:
582       return CmpMode::NE;
583     }
584   }(CondCode.get());
585 
586   if (FTZ)
587     PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
588 
589   return PTXCmpMode;
590 }
591 
592 bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
593   unsigned PTXCmpMode =
594       getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
595   SDLoc DL(N);
596   SDNode *SetP = CurDAG->getMachineNode(
597       NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
598       N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
599   ReplaceNode(N, SetP);
600   return true;
601 }
602 
603 // Find all instances of extract_vector_elt that use this v2f16 vector
604 // and coalesce them into a scattering move instruction.
605 bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
606   SDValue Vector = N->getOperand(0);
607 
608   // We only care about f16x2 as it's the only real vector type we
609   // need to deal with.
610   if (Vector.getSimpleValueType() != MVT::v2f16)
611     return false;
612 
613   // Find and record all uses of this vector that extract element 0 or 1.
614   SmallVector<SDNode *, 4> E0, E1;
615   for (auto U : Vector.getNode()->uses()) {
616     if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
617       continue;
618     if (U->getOperand(0) != Vector)
619       continue;
620     if (const ConstantSDNode *IdxConst =
621             dyn_cast<ConstantSDNode>(U->getOperand(1))) {
622       if (IdxConst->getZExtValue() == 0)
623         E0.push_back(U);
624       else if (IdxConst->getZExtValue() == 1)
625         E1.push_back(U);
626       else
627         llvm_unreachable("Invalid vector index.");
628     }
629   }
630 
631   // There's no point scattering f16x2 if we only ever access one
632   // element of it.
633   if (E0.empty() || E1.empty())
634     return false;
635 
636   unsigned Op = NVPTX::SplitF16x2;
637   // If the vector has been BITCAST'ed from i32, we can use original
638   // value directly and avoid register-to-register move.
639   SDValue Source = Vector;
640   if (Vector->getOpcode() == ISD::BITCAST) {
641     Op = NVPTX::SplitI32toF16x2;
642     Source = Vector->getOperand(0);
643   }
644   // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
645   // into f16,f16 SplitF16x2(V)
646   SDNode *ScatterOp =
647       CurDAG->getMachineNode(Op, SDLoc(N), MVT::f16, MVT::f16, Source);
648   for (auto *Node : E0)
649     ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
650   for (auto *Node : E1)
651     ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
652 
653   return true;
654 }
655 
656 static unsigned int getCodeAddrSpace(MemSDNode *N) {
657   const Value *Src = N->getMemOperand()->getValue();
658 
659   if (!Src)
660     return NVPTX::PTXLdStInstCode::GENERIC;
661 
662   if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
663     switch (PT->getAddressSpace()) {
664     case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
665     case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
666     case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
667     case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
668     case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
669     case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
670     default: break;
671     }
672   }
673   return NVPTX::PTXLdStInstCode::GENERIC;
674 }
675 
676 static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
677                           unsigned CodeAddrSpace, MachineFunction *F) {
678   // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
679   // space.
680   //
681   // We have two ways of identifying invariant loads: Loads may be explicitly
682   // marked as invariant, or we may infer them to be invariant.
683   //
684   // We currently infer invariance for loads from
685   //  - constant global variables, and
686   //  - kernel function pointer params that are noalias (i.e. __restrict) and
687   //    never written to.
688   //
689   // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
690   // not during the SelectionDAG phase).
691   //
692   // TODO: Infer invariance only at -O2.  We still want to use ldg at -O0 for
693   // explicitly invariant loads because these are how clang tells us to use ldg
694   // when the user uses a builtin.
695   if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
696     return false;
697 
698   if (N->isInvariant())
699     return true;
700 
701   bool IsKernelFn = isKernelFunction(F->getFunction());
702 
703   // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
704   // because the former looks through phi nodes while the latter does not. We
705   // need to look through phi nodes to handle pointer induction variables.
706   SmallVector<const Value *, 8> Objs;
707   getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);
708 
709   return all_of(Objs, [&](const Value *V) {
710     if (auto *A = dyn_cast<const Argument>(V))
711       return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
712     if (auto *GV = dyn_cast<const GlobalVariable>(V))
713       return GV->isConstant();
714     return false;
715   });
716 }
717 
718 bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
719   unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
720   switch (IID) {
721   default:
722     return false;
723   case Intrinsic::nvvm_texsurf_handle_internal:
724     SelectTexSurfHandle(N);
725     return true;
726   }
727 }
728 
729 void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
730   // Op 0 is the intrinsic ID
731   SDValue Wrapper = N->getOperand(1);
732   SDValue GlobalVal = Wrapper.getOperand(0);
733   ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
734                                         MVT::i64, GlobalVal));
735 }
736 
// Lower ISD::ADDRSPACECAST to the matching cvta/cvta.to machine opcode.
// Only casts to or from the generic space are supported; the *_6432/*_3264
// opcode variants are chosen when short pointers are in effect on a 64-bit
// target (presumably 64<->32-bit pointer-width conversions — confirm
// against the NVPTXInstrInfo definitions).
void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  SDValue Src = N->getOperand(0);
  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
  unsigned DstAddrSpace = CastN->getDestAddressSpace();
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
    // Specific to generic
    unsigned Opc;
    switch (SrcAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
                                               : NVPTX::cvta_shared_yes_64)
                         : NVPTX::cvta_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
                                               : NVPTX::cvta_const_yes_64)
                         : NVPTX::cvta_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
                                               : NVPTX::cvta_local_yes_64)
                         : NVPTX::cvta_local_yes;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  } else {
    // Generic to specific
    if (SrcAddrSpace != 0)
      report_fatal_error("Cannot cast between two non-generic address spaces");
    unsigned Opc;
    switch (DstAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
                         : NVPTX::cvta_to_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
                                               : NVPTX::cvta_to_shared_yes_64)
                         : NVPTX::cvta_to_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
                                               : NVPTX::cvta_to_const_yes_64)
                         : NVPTX::cvta_to_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
                                               : NVPTX::cvta_to_local_yes_64)
                         : NVPTX::cvta_to_local_yes;
      break;
    case ADDRESS_SPACE_PARAM:
      // There is no cvta.to.param; a dedicated pseudo is used instead.
      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                         : NVPTX::nvvm_ptr_gen_to_param;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  }
}
808 
// Helper function template to reduce amount of boilerplate code for
// opcode selection: returns the opcode variant matching the given simple
// value type, or None for unsupported types.  i1 shares the i8 variant
// (predicates are handled as 8-bit values; see the load-width logic
// below).  The i64/f64 slots are Optional so callers without a 64-bit
// variant can pass None through.
static Optional<unsigned> pickOpcodeForVT(
    MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16,
    unsigned Opcode_i32, Optional<unsigned> Opcode_i64, unsigned Opcode_f16,
    unsigned Opcode_f16x2, unsigned Opcode_f32, Optional<unsigned> Opcode_f64) {
  switch (VT) {
  case MVT::i1:
  case MVT::i8:
    return Opcode_i8;
  case MVT::i16:
    return Opcode_i16;
  case MVT::i32:
    return Opcode_i32;
  case MVT::i64:
    return Opcode_i64;
  case MVT::f16:
    return Opcode_f16;
  case MVT::v2f16:
    return Opcode_f16x2;
  case MVT::f32:
    return Opcode_f32;
  case MVT::f64:
    return Opcode_f64;
  default:
    return None;
  }
}
837 
// Select a PTX "ld" machine instruction for a scalar load (v2f16 is also
// handled here, loaded as a single b32). Returns false when the node cannot
// be matched on this path (indexed loads, non-simple memory types, atomic
// orderings stronger than monotonic, or no opcode for the value type),
// leaving it for other selection mechanisms.
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *LD = cast<MemSDNode>(N);
  assert(LD->readMem() && "Expected load");
  // PlainLoad is null for atomic loads, which come in as ATOMIC_LOAD nodes.
  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // do not support pre/post inc/dec
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

  AtomicOrdering Ordering = LD->getSuccessOrdering();
  // In order to lower atomic loads with stronger guarantees we would need to
  // use load.acquire or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
  // Loads provably from the global space can use the faster ld.global.nc path.
  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert(LoadedVT == MVT::v2f16 && "Unexpected vector type");
    // v2f16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else if (ScalarVT.isFloatingPoint())
    // f16 uses .b16 as its storage type.
    fromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                             : NVPTX::PTXLdStInstCode::Float;
  else
    fromType = NVPTX::PTXLdStInstCode::Unsigned;

  // Create the machine instruction DAG
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  Optional<unsigned> Opcode;
  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;

  // Each branch below matches one addressing mode; the opcode suffix encodes
  // it: _avar (direct address), _asi (symbol + immediate offset),
  // _ari (register + immediate offset), _areg (plain register).
  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(
        TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar,
        NVPTX::LD_i64_avar, NVPTX::LD_f16_avar, NVPTX::LD_f16x2_avar,
        NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
                                     MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                                 NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                                 NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi,
                                 NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
                                     MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    // Register+immediate has separate 32- and 64-bit pointer variants.
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64,
          NVPTX::LD_f16x2_ari_64, NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, NVPTX::LD_i32_ari,
          NVPTX::LD_i64_ari, NVPTX::LD_f16_ari, NVPTX::LD_f16x2_ari,
          NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
                                     MVT::Other, Ops);
  } else {
    // Fallback: address in a plain register (areg form).
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64,
          NVPTX::LD_f16x2_areg_64, NVPTX::LD_f32_areg_64,
          NVPTX::LD_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, NVPTX::LD_i32_areg,
          NVPTX::LD_i64_areg, NVPTX::LD_f16_areg, NVPTX::LD_f16x2_areg,
          NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), N1, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
                                     MVT::Other, Ops);
  }

  if (!NVPTXLD)
    return false;

  // Preserve the memory operand so later passes keep alias/volatility info.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});

  ReplaceNode(N, NVPTXLD);
  return true;
}
990 
// Select a PTX vector load (ld.v2/ld.v4) for an NVPTXISD::LoadV2/LoadV4 node.
// Mirrors tryLoad but picks from the LDV_* opcode tables and keys the opcode
// choice on the element type. Returns false when the node cannot be matched
// here (non-simple memory type, unexpected opcode, or no opcode for the
// element type).
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  Optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
  // Loads provably from the global space can use the faster ld.global.nc path.
  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else if (ScalarVT.isFloatingPoint())
    // f16 uses .b16 as its storage type.
    FromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                             : NVPTX::PTXLdStInstCode::Float;
  else
    FromType = NVPTX::PTXLdStInstCode::Unsigned;

  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  EVT EltVT = N->getValueType(0);

  // v8f16 is a special case. PTX doesn't have ld.v8.f16
  // instruction. Instead, we split the vector into v2f16 chunks and
  // load them with ld.v4.b32.
  if (EltVT == MVT::v2f16) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    FromType = NVPTX::PTXLdStInstCode::Untyped;
    FromTypeWidth = 32;
  }

  // Each branch below matches one addressing mode; the opcode suffix encodes
  // it: _avar (direct address), _asi (symbol + immediate offset),
  // _ari (register + immediate offset), _areg (plain register). Note that
  // the v4 forms pass None for i64/f64 elements: ld.v4 of 64-bit elements
  // does not exist, so selection fails for those combinations.
  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f16_v2_avar, NVPTX::LDV_f16x2_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
                          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar, None,
                          NVPTX::LDV_f16_v4_avar, NVPTX::LDV_f16x2_v4_avar,
                          NVPTX::LDV_f32_v4_avar, None);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f16_v2_asi, NVPTX::LDV_f16x2_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
                          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi, None,
                          NVPTX::LDV_f16_v4_asi, NVPTX::LDV_f16x2_v4_asi,
                          NVPTX::LDV_f32_v4_asi, None);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    // Register+immediate has separate 32- and 64-bit pointer variants.
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_ari_64,
            NVPTX::LDV_i16_v2_ari_64, NVPTX::LDV_i32_v2_ari_64,
            NVPTX::LDV_i64_v2_ari_64, NVPTX::LDV_f16_v2_ari_64,
            NVPTX::LDV_f16x2_v2_ari_64, NVPTX::LDV_f32_v2_ari_64,
            NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, None,
            NVPTX::LDV_f16_v4_ari_64, NVPTX::LDV_f16x2_v4_ari_64,
            NVPTX::LDV_f32_v4_ari_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f16_v2_ari, NVPTX::LDV_f16x2_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
                            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari, None,
                            NVPTX::LDV_f16_v4_ari, NVPTX::LDV_f16x2_v4_ari,
                            NVPTX::LDV_f32_v4_ari, None);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
  } else {
    // Fallback: address in a plain register (areg form).
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f16_v2_areg_64,
            NVPTX::LDV_f16x2_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, None,
            NVPTX::LDV_f16_v4_areg_64, NVPTX::LDV_f16x2_v4_areg_64,
            NVPTX::LDV_f32_v4_areg_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f16_v2_areg,
                            NVPTX::LDV_f16x2_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg, None,
            NVPTX::LDV_f16_v4_areg, NVPTX::LDV_f16x2_v4_areg,
            NVPTX::LDV_f32_v4_areg, None);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
  }

  // Preserve the memory operand so later passes keep alias/volatility info.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});

  ReplaceNode(N, LD);
  return true;
}
1229 
1230 bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
1231 
1232   SDValue Chain = N->getOperand(0);
1233   SDValue Op1;
1234   MemSDNode *Mem;
1235   bool IsLDG = true;
1236 
1237   // If this is an LDG intrinsic, the address is the third operand. If its an
1238   // LDG/LDU SD node (from custom vector handling), then its the second operand
1239   if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
1240     Op1 = N->getOperand(2);
1241     Mem = cast<MemIntrinsicSDNode>(N);
1242     unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
1243     switch (IID) {
1244     default:
1245       return false;
1246     case Intrinsic::nvvm_ldg_global_f:
1247     case Intrinsic::nvvm_ldg_global_i:
1248     case Intrinsic::nvvm_ldg_global_p:
1249       IsLDG = true;
1250       break;
1251     case Intrinsic::nvvm_ldu_global_f:
1252     case Intrinsic::nvvm_ldu_global_i:
1253     case Intrinsic::nvvm_ldu_global_p:
1254       IsLDG = false;
1255       break;
1256     }
1257   } else {
1258     Op1 = N->getOperand(1);
1259     Mem = cast<MemSDNode>(N);
1260   }
1261 
1262   Optional<unsigned> Opcode;
1263   SDLoc DL(N);
1264   SDNode *LD;
1265   SDValue Base, Offset, Addr;
1266 
1267   EVT EltVT = Mem->getMemoryVT();
1268   unsigned NumElts = 1;
1269   if (EltVT.isVector()) {
1270     NumElts = EltVT.getVectorNumElements();
1271     EltVT = EltVT.getVectorElementType();
1272     // vectors of f16 are loaded/stored as multiples of v2f16 elements.
1273     if (EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) {
1274       assert(NumElts % 2 == 0 && "Vector must have even number of elements");
1275       EltVT = MVT::v2f16;
1276       NumElts /= 2;
1277     }
1278   }
1279 
1280   // Build the "promoted" result VTList for the load. If we are really loading
1281   // i8s, then the return type will be promoted to i16 since we do not expose
1282   // 8-bit registers in NVPTX.
1283   EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
1284   SmallVector<EVT, 5> InstVTs;
1285   for (unsigned i = 0; i != NumElts; ++i) {
1286     InstVTs.push_back(NodeVT);
1287   }
1288   InstVTs.push_back(MVT::Other);
1289   SDVTList InstVTList = CurDAG->getVTList(InstVTs);
1290 
1291   if (SelectDirectAddr(Op1, Addr)) {
1292     switch (N->getOpcode()) {
1293     default:
1294       return false;
1295     case ISD::LOAD:
1296     case ISD::INTRINSIC_W_CHAIN:
1297       if (IsLDG)
1298         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1299                                      NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
1300                                      NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
1301                                      NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
1302                                      NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
1303                                      NVPTX::INT_PTX_LDG_GLOBAL_f16avar,
1304                                      NVPTX::INT_PTX_LDG_GLOBAL_f16x2avar,
1305                                      NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
1306                                      NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
1307       else
1308         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1309                                      NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
1310                                      NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
1311                                      NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
1312                                      NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
1313                                      NVPTX::INT_PTX_LDU_GLOBAL_f16avar,
1314                                      NVPTX::INT_PTX_LDU_GLOBAL_f16x2avar,
1315                                      NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
1316                                      NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
1317       break;
1318     case NVPTXISD::LoadV2:
1319     case NVPTXISD::LDGV2:
1320       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1321                                    NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
1322                                    NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
1323                                    NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
1324                                    NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
1325                                    NVPTX::INT_PTX_LDG_G_v2f16_ELE_avar,
1326                                    NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_avar,
1327                                    NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
1328                                    NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
1329       break;
1330     case NVPTXISD::LDUV2:
1331       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1332                                    NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
1333                                    NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
1334                                    NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
1335                                    NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
1336                                    NVPTX::INT_PTX_LDU_G_v2f16_ELE_avar,
1337                                    NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_avar,
1338                                    NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
1339                                    NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
1340       break;
1341     case NVPTXISD::LoadV4:
1342     case NVPTXISD::LDGV4:
1343       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1344                                NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
1345                                NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
1346                                NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, None,
1347                                NVPTX::INT_PTX_LDG_G_v4f16_ELE_avar,
1348                                NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_avar,
1349                                NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, None);
1350       break;
1351     case NVPTXISD::LDUV4:
1352       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1353                                NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
1354                                NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
1355                                NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, None,
1356                                NVPTX::INT_PTX_LDU_G_v4f16_ELE_avar,
1357                                NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_avar,
1358                                NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, None);
1359       break;
1360     }
1361     if (!Opcode)
1362       return false;
1363     SDValue Ops[] = { Addr, Chain };
1364     LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
1365   } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1366                           : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1367     if (TM.is64Bit()) {
1368       switch (N->getOpcode()) {
1369       default:
1370         return false;
1371       case ISD::LOAD:
1372       case ISD::INTRINSIC_W_CHAIN:
1373         if (IsLDG)
1374           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1375                                        NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
1376                                        NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
1377                                        NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
1378                                        NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
1379                                        NVPTX::INT_PTX_LDG_GLOBAL_f16ari64,
1380                                        NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari64,
1381                                        NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
1382                                        NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
1383         else
1384           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1385                                        NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
1386                                        NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
1387                                        NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
1388                                        NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
1389                                        NVPTX::INT_PTX_LDU_GLOBAL_f16ari64,
1390                                        NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari64,
1391                                        NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
1392                                        NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
1393         break;
1394       case NVPTXISD::LoadV2:
1395       case NVPTXISD::LDGV2:
1396         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1397                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
1398                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
1399                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
1400                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
1401                                      NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari64,
1402                                      NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari64,
1403                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
1404                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
1405         break;
1406       case NVPTXISD::LDUV2:
1407         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1408                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
1409                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
1410                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
1411                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
1412                                      NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari64,
1413                                      NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari64,
1414                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
1415                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
1416         break;
1417       case NVPTXISD::LoadV4:
1418       case NVPTXISD::LDGV4:
1419         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1420                                  NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
1421                                  NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
1422                                  NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, None,
1423                                  NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari64,
1424                                  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari64,
1425                                  NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, None);
1426         break;
1427       case NVPTXISD::LDUV4:
1428         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1429                                  NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
1430                                  NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
1431                                  NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, None,
1432                                  NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari64,
1433                                  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari64,
1434                                  NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, None);
1435         break;
1436       }
1437     } else {
1438       switch (N->getOpcode()) {
1439       default:
1440         return false;
1441       case ISD::LOAD:
1442       case ISD::INTRINSIC_W_CHAIN:
1443         if (IsLDG)
1444           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1445                                        NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
1446                                        NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
1447                                        NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
1448                                        NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
1449                                        NVPTX::INT_PTX_LDG_GLOBAL_f16ari,
1450                                        NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari,
1451                                        NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
1452                                        NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
1453         else
1454           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1455                                        NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
1456                                        NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
1457                                        NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
1458                                        NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
1459                                        NVPTX::INT_PTX_LDU_GLOBAL_f16ari,
1460                                        NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari,
1461                                        NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
1462                                        NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
1463         break;
1464       case NVPTXISD::LoadV2:
1465       case NVPTXISD::LDGV2:
1466         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1467                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
1468                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
1469                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
1470                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
1471                                      NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari32,
1472                                      NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari32,
1473                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
1474                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
1475         break;
1476       case NVPTXISD::LDUV2:
1477         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1478                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
1479                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
1480                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
1481                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
1482                                      NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari32,
1483                                      NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari32,
1484                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
1485                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
1486         break;
1487       case NVPTXISD::LoadV4:
1488       case NVPTXISD::LDGV4:
1489         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1490                                  NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
1491                                  NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
1492                                  NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, None,
1493                                  NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari32,
1494                                  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari32,
1495                                  NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, None);
1496         break;
1497       case NVPTXISD::LDUV4:
1498         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1499                                  NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
1500                                  NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
1501                                  NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, None,
1502                                  NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari32,
1503                                  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari32,
1504                                  NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, None);
1505         break;
1506       }
1507     }
1508     if (!Opcode)
1509       return false;
1510     SDValue Ops[] = {Base, Offset, Chain};
1511     LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
1512   } else {
1513     if (TM.is64Bit()) {
1514       switch (N->getOpcode()) {
1515       default:
1516         return false;
1517       case ISD::LOAD:
1518       case ISD::INTRINSIC_W_CHAIN:
1519         if (IsLDG)
1520           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1521                                        NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
1522                                        NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
1523                                        NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
1524                                        NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
1525                                        NVPTX::INT_PTX_LDG_GLOBAL_f16areg64,
1526                                        NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg64,
1527                                        NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
1528                                        NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
1529         else
1530           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1531                                        NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
1532                                        NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
1533                                        NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
1534                                        NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
1535                                        NVPTX::INT_PTX_LDU_GLOBAL_f16areg64,
1536                                        NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg64,
1537                                        NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
1538                                        NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
1539         break;
1540       case NVPTXISD::LoadV2:
1541       case NVPTXISD::LDGV2:
1542         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1543                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
1544                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
1545                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
1546                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
1547                                      NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg64,
1548                                      NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg64,
1549                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
1550                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
1551         break;
1552       case NVPTXISD::LDUV2:
1553         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1554                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
1555                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
1556                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
1557                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
1558                                      NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg64,
1559                                      NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg64,
1560                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
1561                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
1562         break;
1563       case NVPTXISD::LoadV4:
1564       case NVPTXISD::LDGV4:
1565         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1566                                  NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
1567                                  NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
1568                                  NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, None,
1569                                  NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg64,
1570                                  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg64,
1571                                  NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, None);
1572         break;
1573       case NVPTXISD::LDUV4:
1574         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1575                                  NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
1576                                  NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
1577                                  NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, None,
1578                                  NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg64,
1579                                  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg64,
1580                                  NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, None);
1581         break;
1582       }
1583     } else {
1584       switch (N->getOpcode()) {
1585       default:
1586         return false;
1587       case ISD::LOAD:
1588       case ISD::INTRINSIC_W_CHAIN:
1589         if (IsLDG)
1590           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1591                                    NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
1592                                    NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
1593                                    NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
1594                                    NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
1595                                    NVPTX::INT_PTX_LDG_GLOBAL_f16areg,
1596                                    NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg,
1597                                    NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
1598                                    NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
1599         else
1600           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1601                                    NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
1602                                    NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
1603                                    NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
1604                                    NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
1605                                    NVPTX::INT_PTX_LDU_GLOBAL_f16areg,
1606                                    NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg,
1607                                    NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
1608                                    NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
1609         break;
1610       case NVPTXISD::LoadV2:
1611       case NVPTXISD::LDGV2:
1612         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1613                                  NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
1614                                  NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
1615                                  NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
1616                                  NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
1617                                  NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg32,
1618                                  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg32,
1619                                  NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
1620                                  NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
1621         break;
1622       case NVPTXISD::LDUV2:
1623         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1624                                  NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
1625                                  NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
1626                                  NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
1627                                  NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
1628                                  NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg32,
1629                                  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg32,
1630                                  NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
1631                                  NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
1632         break;
1633       case NVPTXISD::LoadV4:
1634       case NVPTXISD::LDGV4:
1635         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1636                                  NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
1637                                  NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
1638                                  NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, None,
1639                                  NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg32,
1640                                  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg32,
1641                                  NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, None);
1642         break;
1643       case NVPTXISD::LDUV4:
1644         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1645                                  NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
1646                                  NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
1647                                  NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, None,
1648                                  NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg32,
1649                                  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg32,
1650                                  NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, None);
1651         break;
1652       }
1653     }
1654     if (!Opcode)
1655       return false;
1656     SDValue Ops[] = { Op1, Chain };
1657     LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
1658   }
1659 
1660   MachineMemOperand *MemRef = Mem->getMemOperand();
1661   CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
1662 
1663   // For automatic generation of LDG (through SelectLoad[Vector], not the
1664   // intrinsics), we may have an extending load like:
1665   //
1666   //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
1667   //
1668   // In this case, the matching logic above will select a load for the original
1669   // memory type (in this case, i8) and our types will not match (the node needs
1670   // to return an i32 in this case). Our LDG/LDU nodes do not support the
1671   // concept of sign-/zero-extension, so emulate it here by adding an explicit
1672   // CVT instruction. Ptxas should clean up any redundancies here.
1673 
1674   EVT OrigType = N->getValueType(0);
1675   LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
1676 
1677   if (OrigType != EltVT && LdNode) {
1678     // We have an extending-load. The instruction we selected operates on the
1679     // smaller type, but the SDNode we are replacing has the larger type. We
1680     // need to emit a CVT to make the types match.
1681     bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
1682     unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
1683                                        EltVT.getSimpleVT(), IsSigned);
1684 
1685     // For each output value, apply the manual sign/zero-extension and make sure
1686     // all users of the load go through that CVT.
1687     for (unsigned i = 0; i != NumElts; ++i) {
1688       SDValue Res(LD, i);
1689       SDValue OrigVal(N, i);
1690 
1691       SDNode *CvtNode =
1692         CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
1693                                CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
1694                                                          DL, MVT::i32));
1695       ReplaceUses(OrigVal, SDValue(CvtNode, 0));
1696     }
1697   }
1698 
1699   ReplaceNode(N, LD);
1700   return true;
1701 }
1702 
// Select a scalar store (plain StoreSDNode or monotonic AtomicSDNode store)
// into the matching NVPTX ST_* machine instruction. Returns false to let the
// default selector handle nodes this routine does not support (indexed
// stores, non-simple types, orderings stronger than monotonic, or value types
// with no matching opcode).
bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *ST = cast<MemSDNode>(N);
  assert(ST->writeMem() && "Expected store");
  // The node is either a plain store or an atomic store; exactly one of
  // these dyn_casts succeeds.
  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
  assert((PlainStore || AtomicStore) && "Expected store");
  EVT StoreVT = ST->getMemoryVT();
  SDNode *NVPTXST = nullptr;

  // do not support pre/post inc/dec
  if (PlainStore && PlainStore->isIndexed())
    return false;

  if (!StoreVT.isSimple())
    return false;

  AtomicOrdering Ordering = ST->getSuccessOrdering();
  // In order to lower atomic stores with stronger guarantees we would need to
  // use store.release or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Vector Setting
  MVT SimpleVT = StoreVT.getSimpleVT();
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  //
  MVT ScalarVT = SimpleVT.getScalarType();
  unsigned toTypeWidth = ScalarVT.getSizeInBits();
  if (SimpleVT.isVector()) {
    assert(StoreVT == MVT::v2f16 && "Unexpected vector type");
    // v2f16 is stored using st.b32
    toTypeWidth = 32;
  }

  unsigned int toType;
  if (ScalarVT.isFloatingPoint())
    // f16 uses .b16 as its storage type.
    toType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                           : NVPTX::PTXLdStInstCode::Float;
  else
    toType = NVPTX::PTXLdStInstCode::Unsigned;

  // Create the machine instruction DAG
  SDValue Chain = ST->getChain();
  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Addr;
  SDValue Offset, Base;
  Optional<unsigned> Opcode;
  MVT::SimpleValueType SourceVT =
      Value.getNode()->getSimpleValueType(0).SimpleTy;

  // Try the addressing modes from most to least specific:
  // direct address (avar), symbol+immediate (asi), register+immediate (ari),
  // and finally plain register (areg).
  if (SelectDirectAddr(BasePtr, Addr)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                             NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
                             NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar,
                             NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Addr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
    // Symbol + immediate offset.
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                             NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
                             NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
                             NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
    // Register + immediate offset; the opcode variant depends on the pointer
    // width of the store's address space.
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
          NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64,
          NVPTX::ST_f16x2_ari_64, NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
                               NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
                               NVPTX::ST_f16_ari, NVPTX::ST_f16x2_ari,
                               NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
    if (!Opcode)
      return false;

    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
  } else {
    // Fallback: address in a register, no folded offset.
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
                          NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
                          NVPTX::ST_f16_areg_64, NVPTX::ST_f16x2_areg_64,
                          NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
                               NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
                               NVPTX::ST_f16_areg, NVPTX::ST_f16x2_areg,
                               NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     BasePtr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
  }

  if (!NVPTXST)
    return false;

  // Carry the memory operand over so later passes keep alias information.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
  ReplaceNode(N, NVPTXST);
  return true;
}
1869 
// Select NVPTXISD::StoreV2/StoreV4 into the matching STV_* machine
// instruction, choosing among the avar/asi/ari/areg addressing-mode variants.
// Returns false when the node is not a supported vector store or no opcode
// exists for the element type.
bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  Optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *ST;
  EVT EltVT = Op1.getValueType();
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT StoreVT = MemSD->getMemoryVT();

  // Address Space Setting
  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
    report_fatal_error("Cannot store to pointer that points to constant "
                       "memory space");
  }
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  assert(StoreVT.isSimple() && "Store value is not simple");
  MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
  unsigned ToTypeWidth = ScalarVT.getSizeInBits();
  unsigned ToType;
  if (ScalarVT.isFloatingPoint())
    // f16 uses .b16 as its storage type.
    ToType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                           : NVPTX::PTXLdStInstCode::Float;
  else
    ToType = NVPTX::PTXLdStInstCode::Unsigned;

  // Gather the values to store first; N2 is the address operand that follows
  // them in the StoreV2/StoreV4 node.
  SmallVector<SDValue, 12> StOps;
  SDValue N2;
  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::StoreV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    N2 = N->getOperand(3);
    break;
  case NVPTXISD::StoreV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    StOps.push_back(N->getOperand(3));
    StOps.push_back(N->getOperand(4));
    N2 = N->getOperand(5);
    break;
  default:
    return false;
  }

  // v8f16 is a special case. PTX doesn't have st.v8.f16
  // instruction. Instead, we split the vector into v2f16 chunks and
  // store them with st.v4.b32.
  // NOTE(review): the assert message below says "load"; it appears to refer
  // to the store opcode — confirm intent before changing the string.
  if (EltVT == MVT::v2f16) {
    assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    ToType = NVPTX::PTXLdStInstCode::Untyped;
    ToTypeWidth = 32;
  }

  // Instruction-flag immediates come after the stored values and before the
  // address operand(s).
  StOps.push_back(getI32Imm(IsVolatile, DL));
  StOps.push_back(getI32Imm(CodeAddrSpace, DL));
  StOps.push_back(getI32Imm(VecType, DL));
  StOps.push_back(getI32Imm(ToType, DL));
  StOps.push_back(getI32Imm(ToTypeWidth, DL));

  // Addressing modes, most to least specific: direct address (avar),
  // symbol+immediate (asi), register+immediate (ari), plain register (areg).
  if (SelectDirectAddr(N2, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
                               NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
                               NVPTX::STV_f16_v2_avar, NVPTX::STV_f16x2_v2_avar,
                               NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
      break;
    case NVPTXISD::StoreV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_avar,
                          NVPTX::STV_i16_v4_avar, NVPTX::STV_i32_v4_avar, None,
                          NVPTX::STV_f16_v4_avar, NVPTX::STV_f16x2_v4_avar,
                          NVPTX::STV_f32_v4_avar, None);
      break;
    }
    StOps.push_back(Addr);
  } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
                               NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
                               NVPTX::STV_f16_v2_asi, NVPTX::STV_f16x2_v2_asi,
                               NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
      break;
    case NVPTXISD::StoreV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
                          NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi, None,
                          NVPTX::STV_f16_v4_asi, NVPTX::STV_f16x2_v4_asi,
                          NVPTX::STV_f32_v4_asi, None);
      break;
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
    // Register + immediate; opcode variant depends on pointer width.
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari_64,
            NVPTX::STV_i16_v2_ari_64, NVPTX::STV_i32_v2_ari_64,
            NVPTX::STV_i64_v2_ari_64, NVPTX::STV_f16_v2_ari_64,
            NVPTX::STV_f16x2_v2_ari_64, NVPTX::STV_f32_v2_ari_64,
            NVPTX::STV_f64_v2_ari_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
            NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, None,
            NVPTX::STV_f16_v4_ari_64, NVPTX::STV_f16x2_v4_ari_64,
            NVPTX::STV_f32_v4_ari_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
                                 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
                                 NVPTX::STV_f16_v2_ari, NVPTX::STV_f16x2_v2_ari,
                                 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
        break;
      case NVPTXISD::StoreV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari,
                            NVPTX::STV_i16_v4_ari, NVPTX::STV_i32_v4_ari, None,
                            NVPTX::STV_f16_v4_ari, NVPTX::STV_f16x2_v4_ari,
                            NVPTX::STV_f32_v4_ari, None);
        break;
      }
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else {
    // Fallback: address held in a register.
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
            NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
            NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f16_v2_areg_64,
            NVPTX::STV_f16x2_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
            NVPTX::STV_f64_v2_areg_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
            NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, None,
            NVPTX::STV_f16_v4_areg_64, NVPTX::STV_f16x2_v4_areg_64,
            NVPTX::STV_f32_v4_areg_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
                            NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
                            NVPTX::STV_i64_v2_areg, NVPTX::STV_f16_v2_areg,
                            NVPTX::STV_f16x2_v2_areg, NVPTX::STV_f32_v2_areg,
                            NVPTX::STV_f64_v2_areg);
        break;
      case NVPTXISD::StoreV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
                            NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg, None,
                            NVPTX::STV_f16_v4_areg, NVPTX::STV_f16x2_v4_areg,
                            NVPTX::STV_f32_v4_areg, None);
        break;
      }
    }
    StOps.push_back(N2);
  }

  if (!Opcode)
    return false;

  StOps.push_back(Chain);

  ST = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, StOps);

  // Carry the memory operand over so later passes keep alias information.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});

  ReplaceNode(N, ST);
  return true;
}
2093 
2094 bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
2095   SDValue Chain = Node->getOperand(0);
2096   SDValue Offset = Node->getOperand(2);
2097   SDValue Flag = Node->getOperand(3);
2098   SDLoc DL(Node);
2099   MemSDNode *Mem = cast<MemSDNode>(Node);
2100 
2101   unsigned VecSize;
2102   switch (Node->getOpcode()) {
2103   default:
2104     return false;
2105   case NVPTXISD::LoadParam:
2106     VecSize = 1;
2107     break;
2108   case NVPTXISD::LoadParamV2:
2109     VecSize = 2;
2110     break;
2111   case NVPTXISD::LoadParamV4:
2112     VecSize = 4;
2113     break;
2114   }
2115 
2116   EVT EltVT = Node->getValueType(0);
2117   EVT MemVT = Mem->getMemoryVT();
2118 
2119   Optional<unsigned> Opcode;
2120 
2121   switch (VecSize) {
2122   default:
2123     return false;
2124   case 1:
2125     Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
2126                              NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
2127                              NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
2128                              NVPTX::LoadParamMemF16, NVPTX::LoadParamMemF16x2,
2129                              NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
2130     break;
2131   case 2:
2132     Opcode =
2133         pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
2134                         NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
2135                         NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F16,
2136                         NVPTX::LoadParamMemV2F16x2, NVPTX::LoadParamMemV2F32,
2137                         NVPTX::LoadParamMemV2F64);
2138     break;
2139   case 4:
2140     Opcode = pickOpcodeForVT(
2141         MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
2142         NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, None,
2143         NVPTX::LoadParamMemV4F16, NVPTX::LoadParamMemV4F16x2,
2144         NVPTX::LoadParamMemV4F32, None);
2145     break;
2146   }
2147   if (!Opcode)
2148     return false;
2149 
2150   SDVTList VTs;
2151   if (VecSize == 1) {
2152     VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
2153   } else if (VecSize == 2) {
2154     VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
2155   } else {
2156     EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
2157     VTs = CurDAG->getVTList(EVTs);
2158   }
2159 
2160   unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2161 
2162   SmallVector<SDValue, 2> Ops;
2163   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2164   Ops.push_back(Chain);
2165   Ops.push_back(Flag);
2166 
2167   ReplaceNode(Node, CurDAG->getMachineNode(Opcode.getValue(), DL, VTs, Ops));
2168   return true;
2169 }
2170 
2171 bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
2172   SDLoc DL(N);
2173   SDValue Chain = N->getOperand(0);
2174   SDValue Offset = N->getOperand(1);
2175   unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2176   MemSDNode *Mem = cast<MemSDNode>(N);
2177 
2178   // How many elements do we have?
2179   unsigned NumElts = 1;
2180   switch (N->getOpcode()) {
2181   default:
2182     return false;
2183   case NVPTXISD::StoreRetval:
2184     NumElts = 1;
2185     break;
2186   case NVPTXISD::StoreRetvalV2:
2187     NumElts = 2;
2188     break;
2189   case NVPTXISD::StoreRetvalV4:
2190     NumElts = 4;
2191     break;
2192   }
2193 
2194   // Build vector of operands
2195   SmallVector<SDValue, 6> Ops;
2196   for (unsigned i = 0; i < NumElts; ++i)
2197     Ops.push_back(N->getOperand(i + 2));
2198   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2199   Ops.push_back(Chain);
2200 
2201   // Determine target opcode
2202   // If we have an i1, use an 8-bit store. The lowering code in
2203   // NVPTXISelLowering will have already emitted an upcast.
2204   Optional<unsigned> Opcode = 0;
2205   switch (NumElts) {
2206   default:
2207     return false;
2208   case 1:
2209     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2210                              NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
2211                              NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
2212                              NVPTX::StoreRetvalF16, NVPTX::StoreRetvalF16x2,
2213                              NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
2214     break;
2215   case 2:
2216     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2217                              NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2218                              NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2219                              NVPTX::StoreRetvalV2F16, NVPTX::StoreRetvalV2F16x2,
2220                              NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2221     break;
2222   case 4:
2223     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2224                              NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2225                              NVPTX::StoreRetvalV4I32, None,
2226                              NVPTX::StoreRetvalV4F16, NVPTX::StoreRetvalV4F16x2,
2227                              NVPTX::StoreRetvalV4F32, None);
2228     break;
2229   }
2230   if (!Opcode)
2231     return false;
2232 
2233   SDNode *Ret = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, Ops);
2234   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2235   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2236 
2237   ReplaceNode(N, Ret);
2238   return true;
2239 }
2240 
2241 bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2242   SDLoc DL(N);
2243   SDValue Chain = N->getOperand(0);
2244   SDValue Param = N->getOperand(1);
2245   unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
2246   SDValue Offset = N->getOperand(2);
2247   unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2248   MemSDNode *Mem = cast<MemSDNode>(N);
2249   SDValue Flag = N->getOperand(N->getNumOperands() - 1);
2250 
2251   // How many elements do we have?
2252   unsigned NumElts = 1;
2253   switch (N->getOpcode()) {
2254   default:
2255     return false;
2256   case NVPTXISD::StoreParamU32:
2257   case NVPTXISD::StoreParamS32:
2258   case NVPTXISD::StoreParam:
2259     NumElts = 1;
2260     break;
2261   case NVPTXISD::StoreParamV2:
2262     NumElts = 2;
2263     break;
2264   case NVPTXISD::StoreParamV4:
2265     NumElts = 4;
2266     break;
2267   }
2268 
2269   // Build vector of operands
2270   SmallVector<SDValue, 8> Ops;
2271   for (unsigned i = 0; i < NumElts; ++i)
2272     Ops.push_back(N->getOperand(i + 3));
2273   Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
2274   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2275   Ops.push_back(Chain);
2276   Ops.push_back(Flag);
2277 
2278   // Determine target opcode
2279   // If we have an i1, use an 8-bit store. The lowering code in
2280   // NVPTXISelLowering will have already emitted an upcast.
2281   Optional<unsigned> Opcode = 0;
2282   switch (N->getOpcode()) {
2283   default:
2284     switch (NumElts) {
2285     default:
2286       return false;
2287     case 1:
2288       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2289                                NVPTX::StoreParamI8, NVPTX::StoreParamI16,
2290                                NVPTX::StoreParamI32, NVPTX::StoreParamI64,
2291                                NVPTX::StoreParamF16, NVPTX::StoreParamF16x2,
2292                                NVPTX::StoreParamF32, NVPTX::StoreParamF64);
2293       break;
2294     case 2:
2295       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2296                                NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
2297                                NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
2298                                NVPTX::StoreParamV2F16, NVPTX::StoreParamV2F16x2,
2299                                NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
2300       break;
2301     case 4:
2302       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2303                                NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
2304                                NVPTX::StoreParamV4I32, None,
2305                                NVPTX::StoreParamV4F16, NVPTX::StoreParamV4F16x2,
2306                                NVPTX::StoreParamV4F32, None);
2307       break;
2308     }
2309     if (!Opcode)
2310       return false;
2311     break;
2312   // Special case: if we have a sign-extend/zero-extend node, insert the
2313   // conversion instruction first, and use that as the value operand to
2314   // the selected StoreParam node.
2315   case NVPTXISD::StoreParamU32: {
2316     Opcode = NVPTX::StoreParamI32;
2317     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2318                                                 MVT::i32);
2319     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2320                                          MVT::i32, Ops[0], CvtNone);
2321     Ops[0] = SDValue(Cvt, 0);
2322     break;
2323   }
2324   case NVPTXISD::StoreParamS32: {
2325     Opcode = NVPTX::StoreParamI32;
2326     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2327                                                 MVT::i32);
2328     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2329                                          MVT::i32, Ops[0], CvtNone);
2330     Ops[0] = SDValue(Cvt, 0);
2331     break;
2332   }
2333   }
2334 
2335   SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2336   SDNode *Ret =
2337       CurDAG->getMachineNode(Opcode.getValue(), DL, RetVTs, Ops);
2338   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2339   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2340 
2341   ReplaceNode(N, Ret);
2342   return true;
2343 }
2344 
// Select a NVPTXISD texture / tld4 node to its machine instruction. The
// mapping is a one-to-one table: each ISD texture opcode corresponds to
// exactly one register-operand machine opcode (the *_RR variants when the
// texture and sampler are separate operands, *_R for unified mode).
// Returns false when \p N is not one of the texture opcodes handled here.
bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
  unsigned Opc = 0;

  switch (N->getOpcode()) {
  default: return false;
  // --- Non-unified mode: texture and sampler are separate operands. ---
  case NVPTXISD::Tex1DFloatS32:
    Opc = NVPTX::TEX_1D_F32_S32_RR;
    break;
  case NVPTXISD::Tex1DFloatFloat:
    Opc = NVPTX::TEX_1D_F32_F32_RR;
    break;
  case NVPTXISD::Tex1DFloatFloatLevel:
    Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DFloatFloatGrad:
    Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DS32S32:
    Opc = NVPTX::TEX_1D_S32_S32_RR;
    break;
  case NVPTXISD::Tex1DS32Float:
    Opc = NVPTX::TEX_1D_S32_F32_RR;
    break;
  case NVPTXISD::Tex1DS32FloatLevel:
    Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DS32FloatGrad:
    Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DU32S32:
    Opc = NVPTX::TEX_1D_U32_S32_RR;
    break;
  case NVPTXISD::Tex1DU32Float:
    Opc = NVPTX::TEX_1D_U32_F32_RR;
    break;
  case NVPTXISD::Tex1DU32FloatLevel:
    Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DU32FloatGrad:
    Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatS32:
    Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloat:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayS32S32:
    Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayS32Float:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayS32FloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayS32FloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayU32S32:
    Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayU32Float:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayU32FloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayU32FloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DFloatS32:
    Opc = NVPTX::TEX_2D_F32_S32_RR;
    break;
  case NVPTXISD::Tex2DFloatFloat:
    Opc = NVPTX::TEX_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tex2DFloatFloatLevel:
    Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DFloatFloatGrad:
    Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DS32S32:
    Opc = NVPTX::TEX_2D_S32_S32_RR;
    break;
  case NVPTXISD::Tex2DS32Float:
    Opc = NVPTX::TEX_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tex2DS32FloatLevel:
    Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DS32FloatGrad:
    Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DU32S32:
    Opc = NVPTX::TEX_2D_U32_S32_RR;
    break;
  case NVPTXISD::Tex2DU32Float:
    Opc = NVPTX::TEX_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tex2DU32FloatLevel:
    Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DU32FloatGrad:
    Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatS32:
    Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloat:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayS32S32:
    Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayS32Float:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayS32FloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayS32FloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayU32S32:
    Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayU32Float:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayU32FloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayU32FloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DFloatS32:
    Opc = NVPTX::TEX_3D_F32_S32_RR;
    break;
  case NVPTXISD::Tex3DFloatFloat:
    Opc = NVPTX::TEX_3D_F32_F32_RR;
    break;
  case NVPTXISD::Tex3DFloatFloatLevel:
    Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DFloatFloatGrad:
    Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DS32S32:
    Opc = NVPTX::TEX_3D_S32_S32_RR;
    break;
  case NVPTXISD::Tex3DS32Float:
    Opc = NVPTX::TEX_3D_S32_F32_RR;
    break;
  case NVPTXISD::Tex3DS32FloatLevel:
    Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DS32FloatGrad:
    Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DU32S32:
    Opc = NVPTX::TEX_3D_U32_S32_RR;
    break;
  case NVPTXISD::Tex3DU32Float:
    Opc = NVPTX::TEX_3D_U32_F32_RR;
    break;
  case NVPTXISD::Tex3DU32FloatLevel:
    Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DU32FloatGrad:
    Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::TexCubeFloatFloat:
    Opc = NVPTX::TEX_CUBE_F32_F32_RR;
    break;
  case NVPTXISD::TexCubeFloatFloatLevel:
    Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeS32Float:
    Opc = NVPTX::TEX_CUBE_S32_F32_RR;
    break;
  case NVPTXISD::TexCubeS32FloatLevel:
    Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeU32Float:
    Opc = NVPTX::TEX_CUBE_U32_F32_RR;
    break;
  case NVPTXISD::TexCubeU32FloatLevel:
    Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayFloatFloat:
    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayS32Float:
    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayS32FloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayU32Float:
    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayU32FloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR;
    break;
  // tld4 (fetch one component of four texels), non-unified mode.
  case NVPTXISD::Tld4R2DFloatFloat:
    Opc = NVPTX::TLD4_R_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DFloatFloat:
    Opc = NVPTX::TLD4_G_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DFloatFloat:
    Opc = NVPTX::TLD4_B_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DFloatFloat:
    Opc = NVPTX::TLD4_A_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4R2DS64Float:
    Opc = NVPTX::TLD4_R_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DS64Float:
    Opc = NVPTX::TLD4_G_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DS64Float:
    Opc = NVPTX::TLD4_B_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DS64Float:
    Opc = NVPTX::TLD4_A_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4R2DU64Float:
    Opc = NVPTX::TLD4_R_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DU64Float:
    Opc = NVPTX::TLD4_G_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DU64Float:
    Opc = NVPTX::TLD4_B_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DU64Float:
    Opc = NVPTX::TLD4_A_2D_U32_F32_RR;
    break;
  // --- Unified mode: one handle combines texture and sampler. ---
  case NVPTXISD::TexUnified1DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DS32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified1DS32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified1DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DU32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified1DU32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified1DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatS32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DS32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified2DS32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified2DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DU32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified2DU32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified2DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatS32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DS32S32:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified3DS32Float:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified3DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DU32S32:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified3DU32Float:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified3DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnifiedCubeFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeS32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeU32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R;
    break;
  // tld4, unified mode.
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedR2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedR2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R;
    break;
  }

  // Copy over operands
  SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
  Ops.push_back(N->getOperand(0)); // Move chain to the back.

  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
  return true;
}
2863 
2864 bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
2865   unsigned Opc = 0;
2866   switch (N->getOpcode()) {
2867   default: return false;
2868   case NVPTXISD::Suld1DI8Clamp:
2869     Opc = NVPTX::SULD_1D_I8_CLAMP_R;
2870     break;
2871   case NVPTXISD::Suld1DI16Clamp:
2872     Opc = NVPTX::SULD_1D_I16_CLAMP_R;
2873     break;
2874   case NVPTXISD::Suld1DI32Clamp:
2875     Opc = NVPTX::SULD_1D_I32_CLAMP_R;
2876     break;
2877   case NVPTXISD::Suld1DI64Clamp:
2878     Opc = NVPTX::SULD_1D_I64_CLAMP_R;
2879     break;
2880   case NVPTXISD::Suld1DV2I8Clamp:
2881     Opc = NVPTX::SULD_1D_V2I8_CLAMP_R;
2882     break;
2883   case NVPTXISD::Suld1DV2I16Clamp:
2884     Opc = NVPTX::SULD_1D_V2I16_CLAMP_R;
2885     break;
2886   case NVPTXISD::Suld1DV2I32Clamp:
2887     Opc = NVPTX::SULD_1D_V2I32_CLAMP_R;
2888     break;
2889   case NVPTXISD::Suld1DV2I64Clamp:
2890     Opc = NVPTX::SULD_1D_V2I64_CLAMP_R;
2891     break;
2892   case NVPTXISD::Suld1DV4I8Clamp:
2893     Opc = NVPTX::SULD_1D_V4I8_CLAMP_R;
2894     break;
2895   case NVPTXISD::Suld1DV4I16Clamp:
2896     Opc = NVPTX::SULD_1D_V4I16_CLAMP_R;
2897     break;
2898   case NVPTXISD::Suld1DV4I32Clamp:
2899     Opc = NVPTX::SULD_1D_V4I32_CLAMP_R;
2900     break;
2901   case NVPTXISD::Suld1DArrayI8Clamp:
2902     Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R;
2903     break;
2904   case NVPTXISD::Suld1DArrayI16Clamp:
2905     Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R;
2906     break;
2907   case NVPTXISD::Suld1DArrayI32Clamp:
2908     Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R;
2909     break;
2910   case NVPTXISD::Suld1DArrayI64Clamp:
2911     Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R;
2912     break;
2913   case NVPTXISD::Suld1DArrayV2I8Clamp:
2914     Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R;
2915     break;
2916   case NVPTXISD::Suld1DArrayV2I16Clamp:
2917     Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R;
2918     break;
2919   case NVPTXISD::Suld1DArrayV2I32Clamp:
2920     Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R;
2921     break;
2922   case NVPTXISD::Suld1DArrayV2I64Clamp:
2923     Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R;
2924     break;
2925   case NVPTXISD::Suld1DArrayV4I8Clamp:
2926     Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R;
2927     break;
2928   case NVPTXISD::Suld1DArrayV4I16Clamp:
2929     Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R;
2930     break;
2931   case NVPTXISD::Suld1DArrayV4I32Clamp:
2932     Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R;
2933     break;
2934   case NVPTXISD::Suld2DI8Clamp:
2935     Opc = NVPTX::SULD_2D_I8_CLAMP_R;
2936     break;
2937   case NVPTXISD::Suld2DI16Clamp:
2938     Opc = NVPTX::SULD_2D_I16_CLAMP_R;
2939     break;
2940   case NVPTXISD::Suld2DI32Clamp:
2941     Opc = NVPTX::SULD_2D_I32_CLAMP_R;
2942     break;
2943   case NVPTXISD::Suld2DI64Clamp:
2944     Opc = NVPTX::SULD_2D_I64_CLAMP_R;
2945     break;
2946   case NVPTXISD::Suld2DV2I8Clamp:
2947     Opc = NVPTX::SULD_2D_V2I8_CLAMP_R;
2948     break;
2949   case NVPTXISD::Suld2DV2I16Clamp:
2950     Opc = NVPTX::SULD_2D_V2I16_CLAMP_R;
2951     break;
2952   case NVPTXISD::Suld2DV2I32Clamp:
2953     Opc = NVPTX::SULD_2D_V2I32_CLAMP_R;
2954     break;
2955   case NVPTXISD::Suld2DV2I64Clamp:
2956     Opc = NVPTX::SULD_2D_V2I64_CLAMP_R;
2957     break;
2958   case NVPTXISD::Suld2DV4I8Clamp:
2959     Opc = NVPTX::SULD_2D_V4I8_CLAMP_R;
2960     break;
2961   case NVPTXISD::Suld2DV4I16Clamp:
2962     Opc = NVPTX::SULD_2D_V4I16_CLAMP_R;
2963     break;
2964   case NVPTXISD::Suld2DV4I32Clamp:
2965     Opc = NVPTX::SULD_2D_V4I32_CLAMP_R;
2966     break;
2967   case NVPTXISD::Suld2DArrayI8Clamp:
2968     Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R;
2969     break;
2970   case NVPTXISD::Suld2DArrayI16Clamp:
2971     Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R;
2972     break;
2973   case NVPTXISD::Suld2DArrayI32Clamp:
2974     Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R;
2975     break;
2976   case NVPTXISD::Suld2DArrayI64Clamp:
2977     Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R;
2978     break;
2979   case NVPTXISD::Suld2DArrayV2I8Clamp:
2980     Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R;
2981     break;
2982   case NVPTXISD::Suld2DArrayV2I16Clamp:
2983     Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R;
2984     break;
2985   case NVPTXISD::Suld2DArrayV2I32Clamp:
2986     Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R;
2987     break;
2988   case NVPTXISD::Suld2DArrayV2I64Clamp:
2989     Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R;
2990     break;
2991   case NVPTXISD::Suld2DArrayV4I8Clamp:
2992     Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R;
2993     break;
2994   case NVPTXISD::Suld2DArrayV4I16Clamp:
2995     Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R;
2996     break;
2997   case NVPTXISD::Suld2DArrayV4I32Clamp:
2998     Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R;
2999     break;
3000   case NVPTXISD::Suld3DI8Clamp:
3001     Opc = NVPTX::SULD_3D_I8_CLAMP_R;
3002     break;
3003   case NVPTXISD::Suld3DI16Clamp:
3004     Opc = NVPTX::SULD_3D_I16_CLAMP_R;
3005     break;
3006   case NVPTXISD::Suld3DI32Clamp:
3007     Opc = NVPTX::SULD_3D_I32_CLAMP_R;
3008     break;
3009   case NVPTXISD::Suld3DI64Clamp:
3010     Opc = NVPTX::SULD_3D_I64_CLAMP_R;
3011     break;
3012   case NVPTXISD::Suld3DV2I8Clamp:
3013     Opc = NVPTX::SULD_3D_V2I8_CLAMP_R;
3014     break;
3015   case NVPTXISD::Suld3DV2I16Clamp:
3016     Opc = NVPTX::SULD_3D_V2I16_CLAMP_R;
3017     break;
3018   case NVPTXISD::Suld3DV2I32Clamp:
3019     Opc = NVPTX::SULD_3D_V2I32_CLAMP_R;
3020     break;
3021   case NVPTXISD::Suld3DV2I64Clamp:
3022     Opc = NVPTX::SULD_3D_V2I64_CLAMP_R;
3023     break;
3024   case NVPTXISD::Suld3DV4I8Clamp:
3025     Opc = NVPTX::SULD_3D_V4I8_CLAMP_R;
3026     break;
3027   case NVPTXISD::Suld3DV4I16Clamp:
3028     Opc = NVPTX::SULD_3D_V4I16_CLAMP_R;
3029     break;
3030   case NVPTXISD::Suld3DV4I32Clamp:
3031     Opc = NVPTX::SULD_3D_V4I32_CLAMP_R;
3032     break;
3033   case NVPTXISD::Suld1DI8Trap:
3034     Opc = NVPTX::SULD_1D_I8_TRAP_R;
3035     break;
3036   case NVPTXISD::Suld1DI16Trap:
3037     Opc = NVPTX::SULD_1D_I16_TRAP_R;
3038     break;
3039   case NVPTXISD::Suld1DI32Trap:
3040     Opc = NVPTX::SULD_1D_I32_TRAP_R;
3041     break;
3042   case NVPTXISD::Suld1DI64Trap:
3043     Opc = NVPTX::SULD_1D_I64_TRAP_R;
3044     break;
3045   case NVPTXISD::Suld1DV2I8Trap:
3046     Opc = NVPTX::SULD_1D_V2I8_TRAP_R;
3047     break;
3048   case NVPTXISD::Suld1DV2I16Trap:
3049     Opc = NVPTX::SULD_1D_V2I16_TRAP_R;
3050     break;
3051   case NVPTXISD::Suld1DV2I32Trap:
3052     Opc = NVPTX::SULD_1D_V2I32_TRAP_R;
3053     break;
3054   case NVPTXISD::Suld1DV2I64Trap:
3055     Opc = NVPTX::SULD_1D_V2I64_TRAP_R;
3056     break;
3057   case NVPTXISD::Suld1DV4I8Trap:
3058     Opc = NVPTX::SULD_1D_V4I8_TRAP_R;
3059     break;
3060   case NVPTXISD::Suld1DV4I16Trap:
3061     Opc = NVPTX::SULD_1D_V4I16_TRAP_R;
3062     break;
3063   case NVPTXISD::Suld1DV4I32Trap:
3064     Opc = NVPTX::SULD_1D_V4I32_TRAP_R;
3065     break;
3066   case NVPTXISD::Suld1DArrayI8Trap:
3067     Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R;
3068     break;
3069   case NVPTXISD::Suld1DArrayI16Trap:
3070     Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R;
3071     break;
3072   case NVPTXISD::Suld1DArrayI32Trap:
3073     Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R;
3074     break;
3075   case NVPTXISD::Suld1DArrayI64Trap:
3076     Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R;
3077     break;
3078   case NVPTXISD::Suld1DArrayV2I8Trap:
3079     Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R;
3080     break;
3081   case NVPTXISD::Suld1DArrayV2I16Trap:
3082     Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R;
3083     break;
3084   case NVPTXISD::Suld1DArrayV2I32Trap:
3085     Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R;
3086     break;
3087   case NVPTXISD::Suld1DArrayV2I64Trap:
3088     Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R;
3089     break;
3090   case NVPTXISD::Suld1DArrayV4I8Trap:
3091     Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R;
3092     break;
3093   case NVPTXISD::Suld1DArrayV4I16Trap:
3094     Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R;
3095     break;
3096   case NVPTXISD::Suld1DArrayV4I32Trap:
3097     Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R;
3098     break;
3099   case NVPTXISD::Suld2DI8Trap:
3100     Opc = NVPTX::SULD_2D_I8_TRAP_R;
3101     break;
3102   case NVPTXISD::Suld2DI16Trap:
3103     Opc = NVPTX::SULD_2D_I16_TRAP_R;
3104     break;
3105   case NVPTXISD::Suld2DI32Trap:
3106     Opc = NVPTX::SULD_2D_I32_TRAP_R;
3107     break;
3108   case NVPTXISD::Suld2DI64Trap:
3109     Opc = NVPTX::SULD_2D_I64_TRAP_R;
3110     break;
3111   case NVPTXISD::Suld2DV2I8Trap:
3112     Opc = NVPTX::SULD_2D_V2I8_TRAP_R;
3113     break;
3114   case NVPTXISD::Suld2DV2I16Trap:
3115     Opc = NVPTX::SULD_2D_V2I16_TRAP_R;
3116     break;
3117   case NVPTXISD::Suld2DV2I32Trap:
3118     Opc = NVPTX::SULD_2D_V2I32_TRAP_R;
3119     break;
3120   case NVPTXISD::Suld2DV2I64Trap:
3121     Opc = NVPTX::SULD_2D_V2I64_TRAP_R;
3122     break;
3123   case NVPTXISD::Suld2DV4I8Trap:
3124     Opc = NVPTX::SULD_2D_V4I8_TRAP_R;
3125     break;
3126   case NVPTXISD::Suld2DV4I16Trap:
3127     Opc = NVPTX::SULD_2D_V4I16_TRAP_R;
3128     break;
3129   case NVPTXISD::Suld2DV4I32Trap:
3130     Opc = NVPTX::SULD_2D_V4I32_TRAP_R;
3131     break;
3132   case NVPTXISD::Suld2DArrayI8Trap:
3133     Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R;
3134     break;
3135   case NVPTXISD::Suld2DArrayI16Trap:
3136     Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R;
3137     break;
3138   case NVPTXISD::Suld2DArrayI32Trap:
3139     Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R;
3140     break;
3141   case NVPTXISD::Suld2DArrayI64Trap:
3142     Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R;
3143     break;
3144   case NVPTXISD::Suld2DArrayV2I8Trap:
3145     Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R;
3146     break;
3147   case NVPTXISD::Suld2DArrayV2I16Trap:
3148     Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R;
3149     break;
3150   case NVPTXISD::Suld2DArrayV2I32Trap:
3151     Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R;
3152     break;
3153   case NVPTXISD::Suld2DArrayV2I64Trap:
3154     Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R;
3155     break;
3156   case NVPTXISD::Suld2DArrayV4I8Trap:
3157     Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R;
3158     break;
3159   case NVPTXISD::Suld2DArrayV4I16Trap:
3160     Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R;
3161     break;
3162   case NVPTXISD::Suld2DArrayV4I32Trap:
3163     Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R;
3164     break;
3165   case NVPTXISD::Suld3DI8Trap:
3166     Opc = NVPTX::SULD_3D_I8_TRAP_R;
3167     break;
3168   case NVPTXISD::Suld3DI16Trap:
3169     Opc = NVPTX::SULD_3D_I16_TRAP_R;
3170     break;
3171   case NVPTXISD::Suld3DI32Trap:
3172     Opc = NVPTX::SULD_3D_I32_TRAP_R;
3173     break;
3174   case NVPTXISD::Suld3DI64Trap:
3175     Opc = NVPTX::SULD_3D_I64_TRAP_R;
3176     break;
3177   case NVPTXISD::Suld3DV2I8Trap:
3178     Opc = NVPTX::SULD_3D_V2I8_TRAP_R;
3179     break;
3180   case NVPTXISD::Suld3DV2I16Trap:
3181     Opc = NVPTX::SULD_3D_V2I16_TRAP_R;
3182     break;
3183   case NVPTXISD::Suld3DV2I32Trap:
3184     Opc = NVPTX::SULD_3D_V2I32_TRAP_R;
3185     break;
3186   case NVPTXISD::Suld3DV2I64Trap:
3187     Opc = NVPTX::SULD_3D_V2I64_TRAP_R;
3188     break;
3189   case NVPTXISD::Suld3DV4I8Trap:
3190     Opc = NVPTX::SULD_3D_V4I8_TRAP_R;
3191     break;
3192   case NVPTXISD::Suld3DV4I16Trap:
3193     Opc = NVPTX::SULD_3D_V4I16_TRAP_R;
3194     break;
3195   case NVPTXISD::Suld3DV4I32Trap:
3196     Opc = NVPTX::SULD_3D_V4I32_TRAP_R;
3197     break;
3198   case NVPTXISD::Suld1DI8Zero:
3199     Opc = NVPTX::SULD_1D_I8_ZERO_R;
3200     break;
3201   case NVPTXISD::Suld1DI16Zero:
3202     Opc = NVPTX::SULD_1D_I16_ZERO_R;
3203     break;
3204   case NVPTXISD::Suld1DI32Zero:
3205     Opc = NVPTX::SULD_1D_I32_ZERO_R;
3206     break;
3207   case NVPTXISD::Suld1DI64Zero:
3208     Opc = NVPTX::SULD_1D_I64_ZERO_R;
3209     break;
3210   case NVPTXISD::Suld1DV2I8Zero:
3211     Opc = NVPTX::SULD_1D_V2I8_ZERO_R;
3212     break;
3213   case NVPTXISD::Suld1DV2I16Zero:
3214     Opc = NVPTX::SULD_1D_V2I16_ZERO_R;
3215     break;
3216   case NVPTXISD::Suld1DV2I32Zero:
3217     Opc = NVPTX::SULD_1D_V2I32_ZERO_R;
3218     break;
3219   case NVPTXISD::Suld1DV2I64Zero:
3220     Opc = NVPTX::SULD_1D_V2I64_ZERO_R;
3221     break;
3222   case NVPTXISD::Suld1DV4I8Zero:
3223     Opc = NVPTX::SULD_1D_V4I8_ZERO_R;
3224     break;
3225   case NVPTXISD::Suld1DV4I16Zero:
3226     Opc = NVPTX::SULD_1D_V4I16_ZERO_R;
3227     break;
3228   case NVPTXISD::Suld1DV4I32Zero:
3229     Opc = NVPTX::SULD_1D_V4I32_ZERO_R;
3230     break;
3231   case NVPTXISD::Suld1DArrayI8Zero:
3232     Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R;
3233     break;
3234   case NVPTXISD::Suld1DArrayI16Zero:
3235     Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R;
3236     break;
3237   case NVPTXISD::Suld1DArrayI32Zero:
3238     Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R;
3239     break;
3240   case NVPTXISD::Suld1DArrayI64Zero:
3241     Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R;
3242     break;
3243   case NVPTXISD::Suld1DArrayV2I8Zero:
3244     Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R;
3245     break;
3246   case NVPTXISD::Suld1DArrayV2I16Zero:
3247     Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R;
3248     break;
3249   case NVPTXISD::Suld1DArrayV2I32Zero:
3250     Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R;
3251     break;
3252   case NVPTXISD::Suld1DArrayV2I64Zero:
3253     Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R;
3254     break;
3255   case NVPTXISD::Suld1DArrayV4I8Zero:
3256     Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R;
3257     break;
3258   case NVPTXISD::Suld1DArrayV4I16Zero:
3259     Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R;
3260     break;
3261   case NVPTXISD::Suld1DArrayV4I32Zero:
3262     Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R;
3263     break;
3264   case NVPTXISD::Suld2DI8Zero:
3265     Opc = NVPTX::SULD_2D_I8_ZERO_R;
3266     break;
3267   case NVPTXISD::Suld2DI16Zero:
3268     Opc = NVPTX::SULD_2D_I16_ZERO_R;
3269     break;
3270   case NVPTXISD::Suld2DI32Zero:
3271     Opc = NVPTX::SULD_2D_I32_ZERO_R;
3272     break;
3273   case NVPTXISD::Suld2DI64Zero:
3274     Opc = NVPTX::SULD_2D_I64_ZERO_R;
3275     break;
3276   case NVPTXISD::Suld2DV2I8Zero:
3277     Opc = NVPTX::SULD_2D_V2I8_ZERO_R;
3278     break;
3279   case NVPTXISD::Suld2DV2I16Zero:
3280     Opc = NVPTX::SULD_2D_V2I16_ZERO_R;
3281     break;
3282   case NVPTXISD::Suld2DV2I32Zero:
3283     Opc = NVPTX::SULD_2D_V2I32_ZERO_R;
3284     break;
3285   case NVPTXISD::Suld2DV2I64Zero:
3286     Opc = NVPTX::SULD_2D_V2I64_ZERO_R;
3287     break;
3288   case NVPTXISD::Suld2DV4I8Zero:
3289     Opc = NVPTX::SULD_2D_V4I8_ZERO_R;
3290     break;
3291   case NVPTXISD::Suld2DV4I16Zero:
3292     Opc = NVPTX::SULD_2D_V4I16_ZERO_R;
3293     break;
3294   case NVPTXISD::Suld2DV4I32Zero:
3295     Opc = NVPTX::SULD_2D_V4I32_ZERO_R;
3296     break;
3297   case NVPTXISD::Suld2DArrayI8Zero:
3298     Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;
3299     break;
3300   case NVPTXISD::Suld2DArrayI16Zero:
3301     Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;
3302     break;
3303   case NVPTXISD::Suld2DArrayI32Zero:
3304     Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;
3305     break;
3306   case NVPTXISD::Suld2DArrayI64Zero:
3307     Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;
3308     break;
3309   case NVPTXISD::Suld2DArrayV2I8Zero:
3310     Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;
3311     break;
3312   case NVPTXISD::Suld2DArrayV2I16Zero:
3313     Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;
3314     break;
3315   case NVPTXISD::Suld2DArrayV2I32Zero:
3316     Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;
3317     break;
3318   case NVPTXISD::Suld2DArrayV2I64Zero:
3319     Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;
3320     break;
3321   case NVPTXISD::Suld2DArrayV4I8Zero:
3322     Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;
3323     break;
3324   case NVPTXISD::Suld2DArrayV4I16Zero:
3325     Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;
3326     break;
3327   case NVPTXISD::Suld2DArrayV4I32Zero:
3328     Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;
3329     break;
3330   case NVPTXISD::Suld3DI8Zero:
3331     Opc = NVPTX::SULD_3D_I8_ZERO_R;
3332     break;
3333   case NVPTXISD::Suld3DI16Zero:
3334     Opc = NVPTX::SULD_3D_I16_ZERO_R;
3335     break;
3336   case NVPTXISD::Suld3DI32Zero:
3337     Opc = NVPTX::SULD_3D_I32_ZERO_R;
3338     break;
3339   case NVPTXISD::Suld3DI64Zero:
3340     Opc = NVPTX::SULD_3D_I64_ZERO_R;
3341     break;
3342   case NVPTXISD::Suld3DV2I8Zero:
3343     Opc = NVPTX::SULD_3D_V2I8_ZERO_R;
3344     break;
3345   case NVPTXISD::Suld3DV2I16Zero:
3346     Opc = NVPTX::SULD_3D_V2I16_ZERO_R;
3347     break;
3348   case NVPTXISD::Suld3DV2I32Zero:
3349     Opc = NVPTX::SULD_3D_V2I32_ZERO_R;
3350     break;
3351   case NVPTXISD::Suld3DV2I64Zero:
3352     Opc = NVPTX::SULD_3D_V2I64_ZERO_R;
3353     break;
3354   case NVPTXISD::Suld3DV4I8Zero:
3355     Opc = NVPTX::SULD_3D_V4I8_ZERO_R;
3356     break;
3357   case NVPTXISD::Suld3DV4I16Zero:
3358     Opc = NVPTX::SULD_3D_V4I16_ZERO_R;
3359     break;
3360   case NVPTXISD::Suld3DV4I32Zero:
3361     Opc = NVPTX::SULD_3D_V4I32_ZERO_R;
3362     break;
3363   }
3364 
3365   // Copy over operands
3366   SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
3367   Ops.push_back(N->getOperand(0)); // Move chain to the back.
3368 
3369   ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3370   return true;
3371 }
3372 
3373 
/// tryBFE - Look for instruction sequences that can be made more efficient
/// by using the 'bfe' (bit-field extract) PTX instruction.
///
/// Two shapes are recognized:
///   (and (srl/sra val, start), mask)   -- mask must be a run of low-order ones
///   (srl/sra (and val, mask), shift)   -- mask may additionally be a shifted mask
///   (srl/sra (shl val, NN), MM)        -- profitable only when MM >= NN
/// On success, replaces \p N with a BFE_{S,U}{32,64}rii machine node and
/// returns true; otherwise leaves the DAG untouched and returns false.
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue Len;        // bfe 'len' operand (number of bits to extract)
  SDValue Start;      // bfe 'start' operand (bit position to start at)
  SDValue Val;        // the value the field is extracted from
  bool IsSigned = false;

  if (N->getOpcode() == ISD::AND) {
    // Canonicalize the operands
    // We want 'and %val, %mask'
    if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
      std::swap(LHS, RHS);
    }

    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
    if (!Mask) {
      // We need a constant mask on the RHS of the AND
      return false;
    }

    // Extract the mask bits
    uint64_t MaskVal = Mask->getZExtValue();
    if (!isMask_64(MaskVal)) {
      // We *could* handle shifted masks here, but doing so would require an
      // 'and' operation to fix up the low-order bits so we would trade
      // shr+and for bfe+and, which has the same throughput
      return false;
    }

    // How many bits are in our mask?
    uint64_t NumBits = countTrailingOnes(MaskVal);
    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);

    if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
      // We have a 'srl/and' pair, extract the effective start bit and length
      Val = LHS.getNode()->getOperand(0);
      Start = LHS.getNode()->getOperand(1);
      ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
      if (StartConst) {
        uint64_t StartVal = StartConst->getZExtValue();
        // How many "good" bits do we have left?  "good" is defined here as bits
        // that exist in the original value, not shifted in.
        uint64_t GoodBits = Start.getValueSizeInBits() - StartVal;
        if (NumBits > GoodBits) {
          // Do not handle the case where bits have been shifted in. In theory
          // we could handle this, but the cost is likely higher than just
          // emitting the srl/and pair.
          return false;
        }
        Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
      } else {
        // Do not handle the case where the shift amount (can be zero if no srl
        // was found) is not constant. We could handle this case, but it would
        // require run-time logic that would be more expensive than just
        // emitting the srl/and pair.
        return false;
      }
    } else {
      // Do not handle the case where the LHS of the and is not a shift. While
      // it would be trivial to handle this case, it would just transform
      // 'and' -> 'bfe', but 'and' has higher-throughput.
      return false;
    }
  } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
    if (LHS->getOpcode() == ISD::AND) {
      ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
      if (!ShiftCnst) {
        // Shift amount must be constant
        return false;
      }

      uint64_t ShiftAmt = ShiftCnst->getZExtValue();

      SDValue AndLHS = LHS->getOperand(0);
      SDValue AndRHS = LHS->getOperand(1);

      // Canonicalize the AND to have the mask on the RHS
      if (isa<ConstantSDNode>(AndLHS)) {
        std::swap(AndLHS, AndRHS);
      }

      ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
      if (!MaskCnst) {
        // Mask must be constant
        return false;
      }

      uint64_t MaskVal = MaskCnst->getZExtValue();
      uint64_t NumZeros;
      uint64_t NumBits;
      if (isMask_64(MaskVal)) {
        NumZeros = 0;
        // The number of bits in the result bitfield will be the number of
        // trailing ones (the AND) minus the number of bits we shift off
        NumBits = countTrailingOnes(MaskVal) - ShiftAmt;
      } else if (isShiftedMask_64(MaskVal)) {
        NumZeros = countTrailingZeros(MaskVal);
        unsigned NumOnes = countTrailingOnes(MaskVal >> NumZeros);
        // The number of bits in the result bitfield will be the number of
        // trailing zeros plus the number of set bits in the mask minus the
        // number of bits we shift off
        NumBits = NumZeros + NumOnes - ShiftAmt;
      } else {
        // This is not a mask we can handle
        return false;
      }

      if (ShiftAmt < NumZeros) {
        // Handling this case would require extra logic that would make this
        // transformation non-profitable
        return false;
      }

      Val = AndLHS;
      Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
      Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
    } else if (LHS->getOpcode() == ISD::SHL) {
      // Here, we have a pattern like:
      //
      // (sra (shl val, NN), MM)
      // or
      // (srl (shl val, NN), MM)
      //
      // If MM >= NN, we can efficiently optimize this with bfe
      Val = LHS->getOperand(0);

      SDValue ShlRHS = LHS->getOperand(1);
      ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
      if (!ShlCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t InnerShiftAmt = ShlCnst->getZExtValue();

      SDValue ShrRHS = RHS;
      ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
      if (!ShrCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t OuterShiftAmt = ShrCnst->getZExtValue();

      // To avoid extra codegen and be profitable, we need Outer >= Inner
      if (OuterShiftAmt < InnerShiftAmt) {
        return false;
      }

      // If the outer shift is more than the type size, we have no bitfield to
      // extract (since we also check that the inner shift is <= the outer shift
      // then this also implies that the inner shift is < the type size)
      if (OuterShiftAmt >= Val.getValueSizeInBits()) {
        return false;
      }

      Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
                                        MVT::i32);
      Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
                                      DL, MVT::i32);

      if (N->getOpcode() == ISD::SRA) {
        // If we have a arithmetic right shift, we need to use the signed bfe
        // variant
        IsSigned = true;
      }
    } else {
      // No can do...
      return false;
    }
  } else {
    // No can do...
    return false;
  }


  unsigned Opc;
  // Pick the BFE variant matching the value type. The signed variant is used
  // only for the (sra (shl ...)) pattern above; the "and"/"srl" patterns
  // always produce the unsigned form.
  if (Val.getValueType() == MVT::i32) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S32rii;
    } else {
      Opc = NVPTX::BFE_U32rii;
    }
  } else if (Val.getValueType() == MVT::i64) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S64rii;
    } else {
      Opc = NVPTX::BFE_U64rii;
    }
  } else {
    // We cannot handle this type
    return false;
  }

  SDValue Ops[] = {
    Val, Start, Len
  };

  ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
  return true;
}
3579 
3580 // SelectDirectAddr - Match a direct address for DAG.
3581 // A direct address could be a globaladdress or externalsymbol.
3582 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
3583   // Return true if TGA or ES.
3584   if (N.getOpcode() == ISD::TargetGlobalAddress ||
3585       N.getOpcode() == ISD::TargetExternalSymbol) {
3586     Address = N;
3587     return true;
3588   }
3589   if (N.getOpcode() == NVPTXISD::Wrapper) {
3590     Address = N.getOperand(0);
3591     return true;
3592   }
3593   // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3594   if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
3595     if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
3596         CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
3597         CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
3598       return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
3599   }
3600   return false;
3601 }
3602 
3603 // symbol+offset
3604 bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3605     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3606   if (Addr.getOpcode() == ISD::ADD) {
3607     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3608       SDValue base = Addr.getOperand(0);
3609       if (SelectDirectAddr(base, Base)) {
3610         Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3611                                            mvt);
3612         return true;
3613       }
3614     }
3615   }
3616   return false;
3617 }
3618 
3619 // symbol+offset
3620 bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
3621                                      SDValue &Base, SDValue &Offset) {
3622   return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
3623 }
3624 
3625 // symbol+offset
3626 bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
3627                                        SDValue &Base, SDValue &Offset) {
3628   return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
3629 }
3630 
3631 // register+offset
3632 bool NVPTXDAGToDAGISel::SelectADDRri_imp(
3633     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3634   if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
3635     Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3636     Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
3637     return true;
3638   }
3639   if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
3640       Addr.getOpcode() == ISD::TargetGlobalAddress)
3641     return false; // direct calls.
3642 
3643   if (Addr.getOpcode() == ISD::ADD) {
3644     if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
3645       return false;
3646     }
3647     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3648       if (FrameIndexSDNode *FIN =
3649               dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
3650         // Constant offset from frame ref.
3651         Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3652       else
3653         Base = Addr.getOperand(0);
3654       Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3655                                          mvt);
3656       return true;
3657     }
3658   }
3659   return false;
3660 }
3661 
3662 // register+offset
3663 bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
3664                                      SDValue &Base, SDValue &Offset) {
3665   return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
3666 }
3667 
3668 // register+offset
3669 bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
3670                                        SDValue &Base, SDValue &Offset) {
3671   return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
3672 }
3673 
3674 bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
3675                                                  unsigned int spN) const {
3676   const Value *Src = nullptr;
3677   if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
3678     if (spN == 0 && mN->getMemOperand()->getPseudoValue())
3679       return true;
3680     Src = mN->getMemOperand()->getValue();
3681   }
3682   if (!Src)
3683     return false;
3684   if (auto *PT = dyn_cast<PointerType>(Src->getType()))
3685     return (PT->getAddressSpace() == spN);
3686   return false;
3687 }
3688 
3689 /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3690 /// inline asm expressions.
3691 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
3692     const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
3693   SDValue Op0, Op1;
3694   switch (ConstraintID) {
3695   default:
3696     return true;
3697   case InlineAsm::Constraint_m: // memory
3698     if (SelectDirectAddr(Op, Op0)) {
3699       OutOps.push_back(Op0);
3700       OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
3701       return false;
3702     }
3703     if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
3704       OutOps.push_back(Op0);
3705       OutOps.push_back(Op1);
3706       return false;
3707     }
3708     break;
3709   }
3710   return true;
3711 }
3712 
3713 /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
3714 /// conversion from \p SrcTy to \p DestTy.
3715 unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
3716                                              bool IsSigned) {
3717   switch (SrcTy.SimpleTy) {
3718   default:
3719     llvm_unreachable("Unhandled source type");
3720   case MVT::i8:
3721     switch (DestTy.SimpleTy) {
3722     default:
3723       llvm_unreachable("Unhandled dest type");
3724     case MVT::i16:
3725       return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
3726     case MVT::i32:
3727       return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
3728     case MVT::i64:
3729       return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
3730     }
3731   case MVT::i16:
3732     switch (DestTy.SimpleTy) {
3733     default:
3734       llvm_unreachable("Unhandled dest type");
3735     case MVT::i8:
3736       return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
3737     case MVT::i32:
3738       return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
3739     case MVT::i64:
3740       return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
3741     }
3742   case MVT::i32:
3743     switch (DestTy.SimpleTy) {
3744     default:
3745       llvm_unreachable("Unhandled dest type");
3746     case MVT::i8:
3747       return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
3748     case MVT::i16:
3749       return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
3750     case MVT::i64:
3751       return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
3752     }
3753   case MVT::i64:
3754     switch (DestTy.SimpleTy) {
3755     default:
3756       llvm_unreachable("Unhandled dest type");
3757     case MVT::i8:
3758       return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
3759     case MVT::i16:
3760       return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
3761     case MVT::i32:
3762       return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
3763     }
3764   }
3765 }
3766