xref: /freebsd/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp (revision 1db9f3b21e39176dd5b67cf8ac378633b172463e)
1 //===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines an instruction selector for the NVPTX target.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "NVPTXISelDAGToDAG.h"
14 #include "MCTargetDesc/NVPTXBaseInfo.h"
15 #include "NVPTXUtilities.h"
16 #include "llvm/Analysis/ValueTracking.h"
17 #include "llvm/CodeGen/ISDOpcodes.h"
18 #include "llvm/IR/GlobalValue.h"
19 #include "llvm/IR/Instructions.h"
20 #include "llvm/IR/IntrinsicsNVPTX.h"
21 #include "llvm/Support/AtomicOrdering.h"
22 #include "llvm/Support/CommandLine.h"
23 #include "llvm/Support/Debug.h"
24 #include "llvm/Support/ErrorHandling.h"
25 #include "llvm/Support/raw_ostream.h"
26 #include "llvm/Target/TargetIntrinsicInfo.h"
27 
28 using namespace llvm;
29 
30 #define DEBUG_TYPE "nvptx-isel"
31 #define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
32 
33 /// createNVPTXISelDag - This pass converts a legalized DAG into a
34 /// NVPTX-specific DAG, ready for instruction scheduling.
35 FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
36                                        llvm::CodeGenOptLevel OptLevel) {
37   return new NVPTXDAGToDAGISel(TM, OptLevel);
38 }
39 
// Pass identification: the address of ID (not its value) uniquely identifies
// this pass to the LLVM pass infrastructure.
char NVPTXDAGToDAGISel::ID = 0;

// Register the pass with the pass registry under DEBUG_TYPE / PASS_NAME.
INITIALIZE_PASS(NVPTXDAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)
43 
44 NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
45                                      CodeGenOptLevel OptLevel)
46     : SelectionDAGISel(ID, tm, OptLevel), TM(tm) {
47   doMulWide = (OptLevel > CodeGenOptLevel::None);
48 }
49 
50 bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
51   Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
52   return SelectionDAGISel::runOnMachineFunction(MF);
53 }
54 
55 int NVPTXDAGToDAGISel::getDivF32Level() const {
56   return Subtarget->getTargetLowering()->getDivF32Level();
57 }
58 
59 bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
60   return Subtarget->getTargetLowering()->usePrecSqrtF32();
61 }
62 
63 bool NVPTXDAGToDAGISel::useF32FTZ() const {
64   return Subtarget->getTargetLowering()->useF32FTZ(*MF);
65 }
66 
67 bool NVPTXDAGToDAGISel::allowFMA() const {
68   const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
69   return TL->allowFMA(*MF, OptLevel);
70 }
71 
72 bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
73   const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
74   return TL->allowUnsafeFPMath(*MF);
75 }
76 
77 bool NVPTXDAGToDAGISel::useShortPointers() const {
78   return TM.useShortPointers();
79 }
80 
81 /// Select - Select instructions not customized! Used for
82 /// expanded, promoted and normal instructions.
/// Top-level selection dispatch: routes each DAG node either to a custom
/// trySelect*/Select* routine for NVPTX-specific handling, or falls through
/// to the TableGen-generated SelectCode() matcher.  Each try* routine
/// returns true if it replaced the node; on false we fall back to the
/// generated patterns.
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  switch (N->getOpcode()) {
  // Scalar and atomic loads/stores get custom addressing-mode and
  // address-space handling.
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
  case ISD::ATOMIC_STORE:
    if (tryStore(N))
      return;
    break;
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  // Packed 16x2 compares always go through the custom SETP selectors.
  case NVPTXISD::SETP_F16X2:
    SelectSETP_F16X2(N);
    return;
  case NVPTXISD::SETP_BF16X2:
    SelectSETP_BF16X2(N);
    return;
  // Vector memory operations.
  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDGV2:
  case NVPTXISD::LDGV4:
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDGLDU(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
    if (tryStoreVector(N))
      return;
    break;
  // Parameter-passing and return-value pseudo-ops for the NVPTX ABI.
  case NVPTXISD::LoadParam:
  case NVPTXISD::LoadParamV2:
  case NVPTXISD::LoadParamV4:
    if (tryLoadParam(N))
      return;
    break;
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    if (tryStoreRetval(N))
      return;
    break;
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParamU32:
    if (tryStoreParam(N))
      return;
    break;
  case ISD::INTRINSIC_WO_CHAIN:
    if (tryIntrinsicNoChain(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  // Texture sampling: the entire tex.*/tld4.* family is handled by
  // tryTextureIntrinsic.
  case NVPTXISD::Tex1DFloatS32:
  case NVPTXISD::Tex1DFloatFloat:
  case NVPTXISD::Tex1DFloatFloatLevel:
  case NVPTXISD::Tex1DFloatFloatGrad:
  case NVPTXISD::Tex1DS32S32:
  case NVPTXISD::Tex1DS32Float:
  case NVPTXISD::Tex1DS32FloatLevel:
  case NVPTXISD::Tex1DS32FloatGrad:
  case NVPTXISD::Tex1DU32S32:
  case NVPTXISD::Tex1DU32Float:
  case NVPTXISD::Tex1DU32FloatLevel:
  case NVPTXISD::Tex1DU32FloatGrad:
  case NVPTXISD::Tex1DArrayFloatS32:
  case NVPTXISD::Tex1DArrayFloatFloat:
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
  case NVPTXISD::Tex1DArrayS32S32:
  case NVPTXISD::Tex1DArrayS32Float:
  case NVPTXISD::Tex1DArrayS32FloatLevel:
  case NVPTXISD::Tex1DArrayS32FloatGrad:
  case NVPTXISD::Tex1DArrayU32S32:
  case NVPTXISD::Tex1DArrayU32Float:
  case NVPTXISD::Tex1DArrayU32FloatLevel:
  case NVPTXISD::Tex1DArrayU32FloatGrad:
  case NVPTXISD::Tex2DFloatS32:
  case NVPTXISD::Tex2DFloatFloat:
  case NVPTXISD::Tex2DFloatFloatLevel:
  case NVPTXISD::Tex2DFloatFloatGrad:
  case NVPTXISD::Tex2DS32S32:
  case NVPTXISD::Tex2DS32Float:
  case NVPTXISD::Tex2DS32FloatLevel:
  case NVPTXISD::Tex2DS32FloatGrad:
  case NVPTXISD::Tex2DU32S32:
  case NVPTXISD::Tex2DU32Float:
  case NVPTXISD::Tex2DU32FloatLevel:
  case NVPTXISD::Tex2DU32FloatGrad:
  case NVPTXISD::Tex2DArrayFloatS32:
  case NVPTXISD::Tex2DArrayFloatFloat:
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
  case NVPTXISD::Tex2DArrayS32S32:
  case NVPTXISD::Tex2DArrayS32Float:
  case NVPTXISD::Tex2DArrayS32FloatLevel:
  case NVPTXISD::Tex2DArrayS32FloatGrad:
  case NVPTXISD::Tex2DArrayU32S32:
  case NVPTXISD::Tex2DArrayU32Float:
  case NVPTXISD::Tex2DArrayU32FloatLevel:
  case NVPTXISD::Tex2DArrayU32FloatGrad:
  case NVPTXISD::Tex3DFloatS32:
  case NVPTXISD::Tex3DFloatFloat:
  case NVPTXISD::Tex3DFloatFloatLevel:
  case NVPTXISD::Tex3DFloatFloatGrad:
  case NVPTXISD::Tex3DS32S32:
  case NVPTXISD::Tex3DS32Float:
  case NVPTXISD::Tex3DS32FloatLevel:
  case NVPTXISD::Tex3DS32FloatGrad:
  case NVPTXISD::Tex3DU32S32:
  case NVPTXISD::Tex3DU32Float:
  case NVPTXISD::Tex3DU32FloatLevel:
  case NVPTXISD::Tex3DU32FloatGrad:
  case NVPTXISD::TexCubeFloatFloat:
  case NVPTXISD::TexCubeFloatFloatLevel:
  case NVPTXISD::TexCubeS32Float:
  case NVPTXISD::TexCubeS32FloatLevel:
  case NVPTXISD::TexCubeU32Float:
  case NVPTXISD::TexCubeU32FloatLevel:
  case NVPTXISD::TexCubeArrayFloatFloat:
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
  case NVPTXISD::TexCubeArrayS32Float:
  case NVPTXISD::TexCubeArrayS32FloatLevel:
  case NVPTXISD::TexCubeArrayU32Float:
  case NVPTXISD::TexCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4R2DFloatFloat:
  case NVPTXISD::Tld4G2DFloatFloat:
  case NVPTXISD::Tld4B2DFloatFloat:
  case NVPTXISD::Tld4A2DFloatFloat:
  case NVPTXISD::Tld4R2DS64Float:
  case NVPTXISD::Tld4G2DS64Float:
  case NVPTXISD::Tld4B2DS64Float:
  case NVPTXISD::Tld4A2DS64Float:
  case NVPTXISD::Tld4R2DU64Float:
  case NVPTXISD::Tld4G2DU64Float:
  case NVPTXISD::Tld4B2DU64Float:
  case NVPTXISD::Tld4A2DU64Float:
  // "Unified" variants: texture accessed without a separate sampler handle.
  case NVPTXISD::TexUnified1DFloatS32:
  case NVPTXISD::TexUnified1DFloatFloat:
  case NVPTXISD::TexUnified1DFloatFloatLevel:
  case NVPTXISD::TexUnified1DFloatFloatGrad:
  case NVPTXISD::TexUnified1DS32S32:
  case NVPTXISD::TexUnified1DS32Float:
  case NVPTXISD::TexUnified1DS32FloatLevel:
  case NVPTXISD::TexUnified1DS32FloatGrad:
  case NVPTXISD::TexUnified1DU32S32:
  case NVPTXISD::TexUnified1DU32Float:
  case NVPTXISD::TexUnified1DU32FloatLevel:
  case NVPTXISD::TexUnified1DU32FloatGrad:
  case NVPTXISD::TexUnified1DArrayFloatS32:
  case NVPTXISD::TexUnified1DArrayFloatFloat:
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified1DArrayS32S32:
  case NVPTXISD::TexUnified1DArrayS32Float:
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
  case NVPTXISD::TexUnified1DArrayU32S32:
  case NVPTXISD::TexUnified1DArrayU32Float:
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
  case NVPTXISD::TexUnified2DFloatS32:
  case NVPTXISD::TexUnified2DFloatFloat:
  case NVPTXISD::TexUnified2DFloatFloatLevel:
  case NVPTXISD::TexUnified2DFloatFloatGrad:
  case NVPTXISD::TexUnified2DS32S32:
  case NVPTXISD::TexUnified2DS32Float:
  case NVPTXISD::TexUnified2DS32FloatLevel:
  case NVPTXISD::TexUnified2DS32FloatGrad:
  case NVPTXISD::TexUnified2DU32S32:
  case NVPTXISD::TexUnified2DU32Float:
  case NVPTXISD::TexUnified2DU32FloatLevel:
  case NVPTXISD::TexUnified2DU32FloatGrad:
  case NVPTXISD::TexUnified2DArrayFloatS32:
  case NVPTXISD::TexUnified2DArrayFloatFloat:
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified2DArrayS32S32:
  case NVPTXISD::TexUnified2DArrayS32Float:
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
  case NVPTXISD::TexUnified2DArrayU32S32:
  case NVPTXISD::TexUnified2DArrayU32Float:
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
  case NVPTXISD::TexUnified3DFloatS32:
  case NVPTXISD::TexUnified3DFloatFloat:
  case NVPTXISD::TexUnified3DFloatFloatLevel:
  case NVPTXISD::TexUnified3DFloatFloatGrad:
  case NVPTXISD::TexUnified3DS32S32:
  case NVPTXISD::TexUnified3DS32Float:
  case NVPTXISD::TexUnified3DS32FloatLevel:
  case NVPTXISD::TexUnified3DS32FloatGrad:
  case NVPTXISD::TexUnified3DU32S32:
  case NVPTXISD::TexUnified3DU32Float:
  case NVPTXISD::TexUnified3DU32FloatLevel:
  case NVPTXISD::TexUnified3DU32FloatGrad:
  case NVPTXISD::TexUnifiedCubeFloatFloat:
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeS32Float:
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeU32Float:
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
  case NVPTXISD::Tld4UnifiedR2DS64Float:
  case NVPTXISD::Tld4UnifiedG2DS64Float:
  case NVPTXISD::Tld4UnifiedB2DS64Float:
  case NVPTXISD::Tld4UnifiedA2DS64Float:
  case NVPTXISD::Tld4UnifiedR2DU64Float:
  case NVPTXISD::Tld4UnifiedG2DU64Float:
  case NVPTXISD::Tld4UnifiedB2DU64Float:
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    if (tryTextureIntrinsic(N))
      return;
    break;
  // Surface loads (suld.*): one case per geometry/element-type combination,
  // in three out-of-bounds-handling flavors: Clamp, Trap, and Zero.
  case NVPTXISD::Suld1DI8Clamp:
  case NVPTXISD::Suld1DI16Clamp:
  case NVPTXISD::Suld1DI32Clamp:
  case NVPTXISD::Suld1DI64Clamp:
  case NVPTXISD::Suld1DV2I8Clamp:
  case NVPTXISD::Suld1DV2I16Clamp:
  case NVPTXISD::Suld1DV2I32Clamp:
  case NVPTXISD::Suld1DV2I64Clamp:
  case NVPTXISD::Suld1DV4I8Clamp:
  case NVPTXISD::Suld1DV4I16Clamp:
  case NVPTXISD::Suld1DV4I32Clamp:
  case NVPTXISD::Suld1DArrayI8Clamp:
  case NVPTXISD::Suld1DArrayI16Clamp:
  case NVPTXISD::Suld1DArrayI32Clamp:
  case NVPTXISD::Suld1DArrayI64Clamp:
  case NVPTXISD::Suld1DArrayV2I8Clamp:
  case NVPTXISD::Suld1DArrayV2I16Clamp:
  case NVPTXISD::Suld1DArrayV2I32Clamp:
  case NVPTXISD::Suld1DArrayV2I64Clamp:
  case NVPTXISD::Suld1DArrayV4I8Clamp:
  case NVPTXISD::Suld1DArrayV4I16Clamp:
  case NVPTXISD::Suld1DArrayV4I32Clamp:
  case NVPTXISD::Suld2DI8Clamp:
  case NVPTXISD::Suld2DI16Clamp:
  case NVPTXISD::Suld2DI32Clamp:
  case NVPTXISD::Suld2DI64Clamp:
  case NVPTXISD::Suld2DV2I8Clamp:
  case NVPTXISD::Suld2DV2I16Clamp:
  case NVPTXISD::Suld2DV2I32Clamp:
  case NVPTXISD::Suld2DV2I64Clamp:
  case NVPTXISD::Suld2DV4I8Clamp:
  case NVPTXISD::Suld2DV4I16Clamp:
  case NVPTXISD::Suld2DV4I32Clamp:
  case NVPTXISD::Suld2DArrayI8Clamp:
  case NVPTXISD::Suld2DArrayI16Clamp:
  case NVPTXISD::Suld2DArrayI32Clamp:
  case NVPTXISD::Suld2DArrayI64Clamp:
  case NVPTXISD::Suld2DArrayV2I8Clamp:
  case NVPTXISD::Suld2DArrayV2I16Clamp:
  case NVPTXISD::Suld2DArrayV2I32Clamp:
  case NVPTXISD::Suld2DArrayV2I64Clamp:
  case NVPTXISD::Suld2DArrayV4I8Clamp:
  case NVPTXISD::Suld2DArrayV4I16Clamp:
  case NVPTXISD::Suld2DArrayV4I32Clamp:
  case NVPTXISD::Suld3DI8Clamp:
  case NVPTXISD::Suld3DI16Clamp:
  case NVPTXISD::Suld3DI32Clamp:
  case NVPTXISD::Suld3DI64Clamp:
  case NVPTXISD::Suld3DV2I8Clamp:
  case NVPTXISD::Suld3DV2I16Clamp:
  case NVPTXISD::Suld3DV2I32Clamp:
  case NVPTXISD::Suld3DV2I64Clamp:
  case NVPTXISD::Suld3DV4I8Clamp:
  case NVPTXISD::Suld3DV4I16Clamp:
  case NVPTXISD::Suld3DV4I32Clamp:
  case NVPTXISD::Suld1DI8Trap:
  case NVPTXISD::Suld1DI16Trap:
  case NVPTXISD::Suld1DI32Trap:
  case NVPTXISD::Suld1DI64Trap:
  case NVPTXISD::Suld1DV2I8Trap:
  case NVPTXISD::Suld1DV2I16Trap:
  case NVPTXISD::Suld1DV2I32Trap:
  case NVPTXISD::Suld1DV2I64Trap:
  case NVPTXISD::Suld1DV4I8Trap:
  case NVPTXISD::Suld1DV4I16Trap:
  case NVPTXISD::Suld1DV4I32Trap:
  case NVPTXISD::Suld1DArrayI8Trap:
  case NVPTXISD::Suld1DArrayI16Trap:
  case NVPTXISD::Suld1DArrayI32Trap:
  case NVPTXISD::Suld1DArrayI64Trap:
  case NVPTXISD::Suld1DArrayV2I8Trap:
  case NVPTXISD::Suld1DArrayV2I16Trap:
  case NVPTXISD::Suld1DArrayV2I32Trap:
  case NVPTXISD::Suld1DArrayV2I64Trap:
  case NVPTXISD::Suld1DArrayV4I8Trap:
  case NVPTXISD::Suld1DArrayV4I16Trap:
  case NVPTXISD::Suld1DArrayV4I32Trap:
  case NVPTXISD::Suld2DI8Trap:
  case NVPTXISD::Suld2DI16Trap:
  case NVPTXISD::Suld2DI32Trap:
  case NVPTXISD::Suld2DI64Trap:
  case NVPTXISD::Suld2DV2I8Trap:
  case NVPTXISD::Suld2DV2I16Trap:
  case NVPTXISD::Suld2DV2I32Trap:
  case NVPTXISD::Suld2DV2I64Trap:
  case NVPTXISD::Suld2DV4I8Trap:
  case NVPTXISD::Suld2DV4I16Trap:
  case NVPTXISD::Suld2DV4I32Trap:
  case NVPTXISD::Suld2DArrayI8Trap:
  case NVPTXISD::Suld2DArrayI16Trap:
  case NVPTXISD::Suld2DArrayI32Trap:
  case NVPTXISD::Suld2DArrayI64Trap:
  case NVPTXISD::Suld2DArrayV2I8Trap:
  case NVPTXISD::Suld2DArrayV2I16Trap:
  case NVPTXISD::Suld2DArrayV2I32Trap:
  case NVPTXISD::Suld2DArrayV2I64Trap:
  case NVPTXISD::Suld2DArrayV4I8Trap:
  case NVPTXISD::Suld2DArrayV4I16Trap:
  case NVPTXISD::Suld2DArrayV4I32Trap:
  case NVPTXISD::Suld3DI8Trap:
  case NVPTXISD::Suld3DI16Trap:
  case NVPTXISD::Suld3DI32Trap:
  case NVPTXISD::Suld3DI64Trap:
  case NVPTXISD::Suld3DV2I8Trap:
  case NVPTXISD::Suld3DV2I16Trap:
  case NVPTXISD::Suld3DV2I32Trap:
  case NVPTXISD::Suld3DV2I64Trap:
  case NVPTXISD::Suld3DV4I8Trap:
  case NVPTXISD::Suld3DV4I16Trap:
  case NVPTXISD::Suld3DV4I32Trap:
  case NVPTXISD::Suld1DI8Zero:
  case NVPTXISD::Suld1DI16Zero:
  case NVPTXISD::Suld1DI32Zero:
  case NVPTXISD::Suld1DI64Zero:
  case NVPTXISD::Suld1DV2I8Zero:
  case NVPTXISD::Suld1DV2I16Zero:
  case NVPTXISD::Suld1DV2I32Zero:
  case NVPTXISD::Suld1DV2I64Zero:
  case NVPTXISD::Suld1DV4I8Zero:
  case NVPTXISD::Suld1DV4I16Zero:
  case NVPTXISD::Suld1DV4I32Zero:
  case NVPTXISD::Suld1DArrayI8Zero:
  case NVPTXISD::Suld1DArrayI16Zero:
  case NVPTXISD::Suld1DArrayI32Zero:
  case NVPTXISD::Suld1DArrayI64Zero:
  case NVPTXISD::Suld1DArrayV2I8Zero:
  case NVPTXISD::Suld1DArrayV2I16Zero:
  case NVPTXISD::Suld1DArrayV2I32Zero:
  case NVPTXISD::Suld1DArrayV2I64Zero:
  case NVPTXISD::Suld1DArrayV4I8Zero:
  case NVPTXISD::Suld1DArrayV4I16Zero:
  case NVPTXISD::Suld1DArrayV4I32Zero:
  case NVPTXISD::Suld2DI8Zero:
  case NVPTXISD::Suld2DI16Zero:
  case NVPTXISD::Suld2DI32Zero:
  case NVPTXISD::Suld2DI64Zero:
  case NVPTXISD::Suld2DV2I8Zero:
  case NVPTXISD::Suld2DV2I16Zero:
  case NVPTXISD::Suld2DV2I32Zero:
  case NVPTXISD::Suld2DV2I64Zero:
  case NVPTXISD::Suld2DV4I8Zero:
  case NVPTXISD::Suld2DV4I16Zero:
  case NVPTXISD::Suld2DV4I32Zero:
  case NVPTXISD::Suld2DArrayI8Zero:
  case NVPTXISD::Suld2DArrayI16Zero:
  case NVPTXISD::Suld2DArrayI32Zero:
  case NVPTXISD::Suld2DArrayI64Zero:
  case NVPTXISD::Suld2DArrayV2I8Zero:
  case NVPTXISD::Suld2DArrayV2I16Zero:
  case NVPTXISD::Suld2DArrayV2I32Zero:
  case NVPTXISD::Suld2DArrayV2I64Zero:
  case NVPTXISD::Suld2DArrayV4I8Zero:
  case NVPTXISD::Suld2DArrayV4I16Zero:
  case NVPTXISD::Suld2DArrayV4I32Zero:
  case NVPTXISD::Suld3DI8Zero:
  case NVPTXISD::Suld3DI16Zero:
  case NVPTXISD::Suld3DI32Zero:
  case NVPTXISD::Suld3DI64Zero:
  case NVPTXISD::Suld3DV2I8Zero:
  case NVPTXISD::Suld3DV2I16Zero:
  case NVPTXISD::Suld3DV2I32Zero:
  case NVPTXISD::Suld3DV2I64Zero:
  case NVPTXISD::Suld3DV4I8Zero:
  case NVPTXISD::Suld3DV4I16Zero:
  case NVPTXISD::Suld3DV4I32Zero:
    if (trySurfaceIntrinsic(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
    if (tryBFE(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    return;
  case ISD::ConstantFP:
    if (tryConstantFP(N))
      return;
    break;
  default:
    break;
  }
  // No custom selection applied; use the TableGen-generated matcher.
  SelectCode(N);
}
514 
515 bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
516   unsigned IID = N->getConstantOperandVal(1);
517   switch (IID) {
518   default:
519     return false;
520   case Intrinsic::nvvm_ldg_global_f:
521   case Intrinsic::nvvm_ldg_global_i:
522   case Intrinsic::nvvm_ldg_global_p:
523   case Intrinsic::nvvm_ldu_global_f:
524   case Intrinsic::nvvm_ldu_global_i:
525   case Intrinsic::nvvm_ldu_global_p:
526     return tryLDGLDU(N);
527   }
528 }
529 
530 // There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we
531 // have to load them into an .(b)f16 register first.
532 bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) {
533   if (N->getValueType(0) != MVT::f16 && N->getValueType(0) != MVT::bf16)
534     return false;
535   SDValue Val = CurDAG->getTargetConstantFP(
536       cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), N->getValueType(0));
537   SDNode *LoadConstF16 = CurDAG->getMachineNode(
538       (N->getValueType(0) == MVT::f16 ? NVPTX::LOAD_CONST_F16
539                                       : NVPTX::LOAD_CONST_BF16),
540       SDLoc(N), N->getValueType(0), Val);
541   ReplaceNode(N, LoadConstF16);
542   return true;
543 }
544 
545 // Map ISD:CONDCODE value to appropriate CmpMode expected by
546 // NVPTXInstPrinter::printCmpMode()
547 static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
548   using NVPTX::PTXCmpMode::CmpMode;
549   unsigned PTXCmpMode = [](ISD::CondCode CC) {
550     switch (CC) {
551     default:
552       llvm_unreachable("Unexpected condition code.");
553     case ISD::SETOEQ:
554       return CmpMode::EQ;
555     case ISD::SETOGT:
556       return CmpMode::GT;
557     case ISD::SETOGE:
558       return CmpMode::GE;
559     case ISD::SETOLT:
560       return CmpMode::LT;
561     case ISD::SETOLE:
562       return CmpMode::LE;
563     case ISD::SETONE:
564       return CmpMode::NE;
565     case ISD::SETO:
566       return CmpMode::NUM;
567     case ISD::SETUO:
568       return CmpMode::NotANumber;
569     case ISD::SETUEQ:
570       return CmpMode::EQU;
571     case ISD::SETUGT:
572       return CmpMode::GTU;
573     case ISD::SETUGE:
574       return CmpMode::GEU;
575     case ISD::SETULT:
576       return CmpMode::LTU;
577     case ISD::SETULE:
578       return CmpMode::LEU;
579     case ISD::SETUNE:
580       return CmpMode::NEU;
581     case ISD::SETEQ:
582       return CmpMode::EQ;
583     case ISD::SETGT:
584       return CmpMode::GT;
585     case ISD::SETGE:
586       return CmpMode::GE;
587     case ISD::SETLT:
588       return CmpMode::LT;
589     case ISD::SETLE:
590       return CmpMode::LE;
591     case ISD::SETNE:
592       return CmpMode::NE;
593     }
594   }(CondCode.get());
595 
596   if (FTZ)
597     PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
598 
599   return PTXCmpMode;
600 }
601 
602 bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
603   unsigned PTXCmpMode =
604       getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
605   SDLoc DL(N);
606   SDNode *SetP = CurDAG->getMachineNode(
607       NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
608       N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
609   ReplaceNode(N, SetP);
610   return true;
611 }
612 
613 bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
614   unsigned PTXCmpMode =
615       getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
616   SDLoc DL(N);
617   SDNode *SetP = CurDAG->getMachineNode(
618       NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
619       N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
620   ReplaceNode(N, SetP);
621   return true;
622 }
623 
624 // Find all instances of extract_vector_elt that use this v2f16 vector
625 // and coalesce them into a scattering move instruction.
626 bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
627   SDValue Vector = N->getOperand(0);
628 
629   // We only care about 16x2 as it's the only real vector type we
630   // need to deal with.
631   MVT VT = Vector.getSimpleValueType();
632   if (!Isv2x16VT(VT))
633     return false;
634   // Find and record all uses of this vector that extract element 0 or 1.
635   SmallVector<SDNode *, 4> E0, E1;
636   for (auto *U : Vector.getNode()->uses()) {
637     if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
638       continue;
639     if (U->getOperand(0) != Vector)
640       continue;
641     if (const ConstantSDNode *IdxConst =
642             dyn_cast<ConstantSDNode>(U->getOperand(1))) {
643       if (IdxConst->getZExtValue() == 0)
644         E0.push_back(U);
645       else if (IdxConst->getZExtValue() == 1)
646         E1.push_back(U);
647       else
648         llvm_unreachable("Invalid vector index.");
649     }
650   }
651 
652   // There's no point scattering f16x2 if we only ever access one
653   // element of it.
654   if (E0.empty() || E1.empty())
655     return false;
656 
657   // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
658   // into f16,f16 SplitF16x2(V)
659   MVT EltVT = VT.getVectorElementType();
660   SDNode *ScatterOp =
661       CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);
662   for (auto *Node : E0)
663     ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
664   for (auto *Node : E1)
665     ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
666 
667   return true;
668 }
669 
670 static unsigned int getCodeAddrSpace(MemSDNode *N) {
671   const Value *Src = N->getMemOperand()->getValue();
672 
673   if (!Src)
674     return NVPTX::PTXLdStInstCode::GENERIC;
675 
676   if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
677     switch (PT->getAddressSpace()) {
678     case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
679     case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
680     case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
681     case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
682     case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
683     case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
684     default: break;
685     }
686   }
687   return NVPTX::PTXLdStInstCode::GENERIC;
688 }
689 
690 static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
691                           unsigned CodeAddrSpace, MachineFunction *F) {
692   // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
693   // space.
694   //
695   // We have two ways of identifying invariant loads: Loads may be explicitly
696   // marked as invariant, or we may infer them to be invariant.
697   //
698   // We currently infer invariance for loads from
699   //  - constant global variables, and
700   //  - kernel function pointer params that are noalias (i.e. __restrict) and
701   //    never written to.
702   //
703   // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
704   // not during the SelectionDAG phase).
705   //
706   // TODO: Infer invariance only at -O2.  We still want to use ldg at -O0 for
707   // explicitly invariant loads because these are how clang tells us to use ldg
708   // when the user uses a builtin.
709   if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
710     return false;
711 
712   if (N->isInvariant())
713     return true;
714 
715   bool IsKernelFn = isKernelFunction(F->getFunction());
716 
717   // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
718   // because the former looks through phi nodes while the latter does not. We
719   // need to look through phi nodes to handle pointer induction variables.
720   SmallVector<const Value *, 8> Objs;
721   getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);
722 
723   return all_of(Objs, [&](const Value *V) {
724     if (auto *A = dyn_cast<const Argument>(V))
725       return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
726     if (auto *GV = dyn_cast<const GlobalVariable>(V))
727       return GV->isConstant();
728     return false;
729   });
730 }
731 
732 bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
733   unsigned IID = N->getConstantOperandVal(0);
734   switch (IID) {
735   default:
736     return false;
737   case Intrinsic::nvvm_texsurf_handle_internal:
738     SelectTexSurfHandle(N);
739     return true;
740   }
741 }
742 
743 void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
744   // Op 0 is the intrinsic ID
745   SDValue Wrapper = N->getOperand(1);
746   SDValue GlobalVal = Wrapper.getOperand(0);
747   ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
748                                         MVT::i64, GlobalVal));
749 }
750 
/// Lower an addrspacecast to the matching cvta / cvta.to PTX conversion.
/// Casts into the generic space use cvta.<space>; casts out of it use
/// cvta.to.<space>.  The opcode is further specialized on pointer width
/// (32-bit, 64-bit, or the mixed short-pointer forms).  Casting between two
/// non-generic spaces is not representable and is a fatal error.
void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  SDValue Src = N->getOperand(0);
  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
  unsigned DstAddrSpace = CastN->getDestAddressSpace();
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
    // Specific to generic
    unsigned Opc;
    switch (SrcAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      // The *_6432 forms convert a 32-bit (short) pointer within a 64-bit
      // module; plain *_64 converts a full 64-bit pointer.
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
                                               : NVPTX::cvta_shared_yes_64)
                         : NVPTX::cvta_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
                                               : NVPTX::cvta_const_yes_64)
                         : NVPTX::cvta_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
                                               : NVPTX::cvta_local_yes_64)
                         : NVPTX::cvta_local_yes;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  } else {
    // Generic to specific
    if (SrcAddrSpace != 0)
      report_fatal_error("Cannot cast between two non-generic address spaces");
    unsigned Opc;
    switch (DstAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
                         : NVPTX::cvta_to_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      // The *_3264 forms produce a 32-bit (short) pointer from a 64-bit
      // generic pointer.
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
                                                : NVPTX::cvta_to_shared_yes_64)
                         : NVPTX::cvta_to_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
                                             : NVPTX::cvta_to_const_yes_64)
                         : NVPTX::cvta_to_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
                                               : NVPTX::cvta_to_local_yes_64)
                         : NVPTX::cvta_to_local_yes;
      break;
    case ADDRESS_SPACE_PARAM:
      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                         : NVPTX::nvvm_ptr_gen_to_param;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  }
}
822 
823 // Helper function template to reduce amount of boilerplate code for
824 // opcode selection.
825 static std::optional<unsigned>
826 pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
827                 unsigned Opcode_i16, unsigned Opcode_i32,
828                 std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,
829                 std::optional<unsigned> Opcode_f64) {
830   switch (VT) {
831   case MVT::i1:
832   case MVT::i8:
833     return Opcode_i8;
834   case MVT::i16:
835     return Opcode_i16;
836   case MVT::i32:
837     return Opcode_i32;
838   case MVT::i64:
839     return Opcode_i64;
840   case MVT::f16:
841   case MVT::bf16:
842     return Opcode_i16;
843   case MVT::v2f16:
844   case MVT::v2bf16:
845   case MVT::v2i16:
846   case MVT::v4i8:
847     return Opcode_i32;
848   case MVT::f32:
849     return Opcode_f32;
850   case MVT::f64:
851     return Opcode_f64;
852   default:
853     return std::nullopt;
854   }
855 }
856 
857 static int getLdStRegType(EVT VT) {
858   if (VT.isFloatingPoint())
859     switch (VT.getSimpleVT().SimpleTy) {
860     case MVT::f16:
861     case MVT::bf16:
862     case MVT::v2f16:
863     case MVT::v2bf16:
864       return NVPTX::PTXLdStInstCode::Untyped;
865     default:
866       return NVPTX::PTXLdStInstCode::Float;
867     }
868   else
869     return NVPTX::PTXLdStInstCode::Unsigned;
870 }
871 
// Select a PTX "ld" machine instruction for a scalar (or packed 32-bit
// vector) load node. Returns false to defer to the default selector when the
// load cannot be handled here (indexed loads, non-simple memory types,
// atomic orderings stronger than monotonic, or no matching opcode).
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *LD = cast<MemSDNode>(N);
  assert(LD->readMem() && "Expected load");
  // PlainLoad is null when N is a target-specific load node rather than a
  // generic ISD::LOAD; such nodes have no indexing mode or extension type.
  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // do not support pre/post inc/dec
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

  AtomicOrdering Ordering = LD->getSuccessOrdering();
  // In order to lower atomic loads with stronger guarantees we would need to
  // use load.acquire or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
  // If this qualifies, emit it as an LDG/LDU load instead.
  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  // Pointer width of the load's address space decides between the 32-bit and
  // 64-bit addressing-mode selectors/opcodes below.
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
           "Unexpected vector type");
    // v2f16/v2bf16/v2i16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else
    fromType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;

  // Try the addressing modes in order of preference; the opcode suffix names
  // the mode: _avar (direct address), _asi (symbol + immediate),
  // _ari (register + immediate), _areg (plain register).
  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
                             NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
                             NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    // Symbol + immediate offset (_asi).
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                             NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                             NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    // Register + immediate offset (_ari); opcode depends on pointer width.
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
                          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,
                          NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,
                               NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,
                               NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else {
    // Fallback: address taken directly from a register (_areg).
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
                          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64,
                          NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,
                               NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,
                               NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), N1, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  }

  if (!NVPTXLD)
    return false;

  // Preserve the memory operand so later passes keep alias/volatility info.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});

  ReplaceNode(N, NVPTXLD);
  return true;
}
1012 
// Select a PTX "ld.v2"/"ld.v4" machine instruction for an NVPTXISD::LoadV2
// or NVPTXISD::LoadV4 node. Returns false to defer to the default selector
// when the node cannot be matched (non-simple memory type, unknown opcode,
// or no instruction variant for the element type).
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
  // If this qualifies, emit it as an LDG/LDU load instead.
  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  // Pointer width of the load's address space decides between the 32-bit and
  // 64-bit addressing-mode selectors/opcodes below.
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else
    FromType = getLdStRegType(ScalarVT);

  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  EVT EltVT = N->getValueType(0);

  // v8x16 is a special case. PTX doesn't have ld.v8.16
  // instruction. Instead, we split the vector into v2x16 chunks and
  // load them with ld.v4.b32.
  if (Isv2x16VT(EltVT)) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    FromType = NVPTX::PTXLdStInstCode::Untyped;
    FromTypeWidth = 32;
  }

  // Try the addressing modes in order of preference; the opcode suffix names
  // the mode: _avar (direct address), _asi (symbol + immediate),
  // _ari (register + immediate), _areg (plain register). Note the v4
  // variants have no i64/f64 forms (passed as std::nullopt).
  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
                          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,
                          std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    // Symbol + immediate offset (_asi).
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
                          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,
                          std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    // Register + immediate offset (_ari); opcode depends on pointer width.
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                            NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
                            NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
                            NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
            NVPTX::LDV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
                            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,
                            std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else {
    // Fallback: address taken directly from a register (_areg).
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
            NVPTX::LDV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
                            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,
                            std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  }

  // Preserve the memory operand so later passes keep alias/volatility info.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});

  ReplaceNode(N, LD);
  return true;
}
1236 
1237 bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
1238 
1239   SDValue Chain = N->getOperand(0);
1240   SDValue Op1;
1241   MemSDNode *Mem;
1242   bool IsLDG = true;
1243 
1244   // If this is an LDG intrinsic, the address is the third operand. If its an
1245   // LDG/LDU SD node (from custom vector handling), then its the second operand
1246   if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
1247     Op1 = N->getOperand(2);
1248     Mem = cast<MemIntrinsicSDNode>(N);
1249     unsigned IID = N->getConstantOperandVal(1);
1250     switch (IID) {
1251     default:
1252       return false;
1253     case Intrinsic::nvvm_ldg_global_f:
1254     case Intrinsic::nvvm_ldg_global_i:
1255     case Intrinsic::nvvm_ldg_global_p:
1256       IsLDG = true;
1257       break;
1258     case Intrinsic::nvvm_ldu_global_f:
1259     case Intrinsic::nvvm_ldu_global_i:
1260     case Intrinsic::nvvm_ldu_global_p:
1261       IsLDG = false;
1262       break;
1263     }
1264   } else {
1265     Op1 = N->getOperand(1);
1266     Mem = cast<MemSDNode>(N);
1267   }
1268 
1269   std::optional<unsigned> Opcode;
1270   SDLoc DL(N);
1271   SDNode *LD;
1272   SDValue Base, Offset, Addr;
1273   EVT OrigType = N->getValueType(0);
1274 
1275   EVT EltVT = Mem->getMemoryVT();
1276   unsigned NumElts = 1;
1277   if (EltVT.isVector()) {
1278     NumElts = EltVT.getVectorNumElements();
1279     EltVT = EltVT.getVectorElementType();
1280     // vectors of 16bits type are loaded/stored as multiples of v2x16 elements.
1281     if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
1282         (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
1283         (EltVT == MVT::i16 && OrigType == MVT::v2i16)) {
1284       assert(NumElts % 2 == 0 && "Vector must have even number of elements");
1285       EltVT = OrigType;
1286       NumElts /= 2;
1287     } else if (OrigType == MVT::v4i8) {
1288       EltVT = OrigType;
1289       NumElts = 1;
1290     }
1291   }
1292 
1293   // Build the "promoted" result VTList for the load. If we are really loading
1294   // i8s, then the return type will be promoted to i16 since we do not expose
1295   // 8-bit registers in NVPTX.
1296   EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
1297   SmallVector<EVT, 5> InstVTs;
1298   for (unsigned i = 0; i != NumElts; ++i) {
1299     InstVTs.push_back(NodeVT);
1300   }
1301   InstVTs.push_back(MVT::Other);
1302   SDVTList InstVTList = CurDAG->getVTList(InstVTs);
1303 
1304   if (SelectDirectAddr(Op1, Addr)) {
1305     switch (N->getOpcode()) {
1306     default:
1307       return false;
1308     case ISD::LOAD:
1309     case ISD::INTRINSIC_W_CHAIN:
1310       if (IsLDG)
1311         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1312                                  NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
1313                                  NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
1314                                  NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
1315                                  NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
1316                                  NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
1317                                  NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
1318       else
1319         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1320                                  NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
1321                                  NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
1322                                  NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
1323                                  NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
1324                                  NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
1325                                  NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
1326       break;
1327     case NVPTXISD::LoadV2:
1328     case NVPTXISD::LDGV2:
1329       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1330                                NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
1331                                NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
1332                                NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
1333                                NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
1334                                NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
1335                                NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
1336       break;
1337     case NVPTXISD::LDUV2:
1338       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1339                                NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
1340                                NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
1341                                NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
1342                                NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
1343                                NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
1344                                NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
1345       break;
1346     case NVPTXISD::LoadV4:
1347     case NVPTXISD::LDGV4:
1348       Opcode = pickOpcodeForVT(
1349           EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
1350           NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
1351           NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
1352           NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
1353       break;
1354     case NVPTXISD::LDUV4:
1355       Opcode = pickOpcodeForVT(
1356           EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
1357           NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
1358           NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
1359           NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
1360       break;
1361     }
1362     if (!Opcode)
1363       return false;
1364     SDValue Ops[] = { Addr, Chain };
1365     LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1366   } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1367                           : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1368     if (TM.is64Bit()) {
1369       switch (N->getOpcode()) {
1370       default:
1371         return false;
1372       case ISD::LOAD:
1373       case ISD::INTRINSIC_W_CHAIN:
1374         if (IsLDG)
1375           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1376                                        NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
1377                                        NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
1378                                        NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
1379                                        NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
1380                                        NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
1381                                        NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
1382         else
1383           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1384                                        NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
1385                                        NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
1386                                        NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
1387                                        NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
1388                                        NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
1389                                        NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
1390         break;
1391       case NVPTXISD::LoadV2:
1392       case NVPTXISD::LDGV2:
1393         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1394                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
1395                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
1396                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
1397                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
1398                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
1399                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
1400         break;
1401       case NVPTXISD::LDUV2:
1402         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1403                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
1404                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
1405                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
1406                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
1407                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
1408                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
1409         break;
1410       case NVPTXISD::LoadV4:
1411       case NVPTXISD::LDGV4:
1412         Opcode = pickOpcodeForVT(
1413             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
1414             NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
1415             NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
1416             NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
1417         break;
1418       case NVPTXISD::LDUV4:
1419         Opcode = pickOpcodeForVT(
1420             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
1421             NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
1422             NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
1423             NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
1424         break;
1425       }
1426     } else {
1427       switch (N->getOpcode()) {
1428       default:
1429         return false;
1430       case ISD::LOAD:
1431       case ISD::INTRINSIC_W_CHAIN:
1432         if (IsLDG)
1433           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1434                                    NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
1435                                    NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
1436                                    NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
1437                                    NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
1438                                    NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
1439                                    NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
1440         else
1441           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1442                                    NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
1443                                    NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
1444                                    NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
1445                                    NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
1446                                    NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
1447                                    NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
1448         break;
1449       case NVPTXISD::LoadV2:
1450       case NVPTXISD::LDGV2:
1451         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1452                                  NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
1453                                  NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
1454                                  NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
1455                                  NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
1456                                  NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
1457                                  NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
1458         break;
1459       case NVPTXISD::LDUV2:
1460         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1461                                  NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
1462                                  NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
1463                                  NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
1464                                  NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
1465                                  NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
1466                                  NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
1467         break;
1468       case NVPTXISD::LoadV4:
1469       case NVPTXISD::LDGV4:
1470         Opcode = pickOpcodeForVT(
1471             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
1472             NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
1473             NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
1474             NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
1475         break;
1476       case NVPTXISD::LDUV4:
1477         Opcode = pickOpcodeForVT(
1478             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
1479             NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
1480             NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
1481             NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
1482         break;
1483       }
1484     }
1485     if (!Opcode)
1486       return false;
1487     SDValue Ops[] = {Base, Offset, Chain};
1488     LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1489   } else {
1490     if (TM.is64Bit()) {
1491       switch (N->getOpcode()) {
1492       default:
1493         return false;
1494       case ISD::LOAD:
1495       case ISD::INTRINSIC_W_CHAIN:
1496         if (IsLDG)
1497           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1498                                        NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
1499                                        NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
1500                                        NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
1501                                        NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
1502                                        NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
1503                                        NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
1504         else
1505           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1506                                        NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
1507                                        NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
1508                                        NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
1509                                        NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
1510                                        NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
1511                                        NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
1512         break;
1513       case NVPTXISD::LoadV2:
1514       case NVPTXISD::LDGV2:
1515         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1516                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
1517                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
1518                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
1519                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
1520                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
1521                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
1522         break;
1523       case NVPTXISD::LDUV2:
1524         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1525                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
1526                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
1527                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
1528                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
1529                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
1530                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
1531         break;
1532       case NVPTXISD::LoadV4:
1533       case NVPTXISD::LDGV4:
1534         Opcode = pickOpcodeForVT(
1535             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
1536             NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
1537             NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
1538             NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
1539         break;
1540       case NVPTXISD::LDUV4:
1541         Opcode = pickOpcodeForVT(
1542             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
1543             NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
1544             NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
1545             NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
1546         break;
1547       }
1548     } else {
1549       switch (N->getOpcode()) {
1550       default:
1551         return false;
1552       case ISD::LOAD:
1553       case ISD::INTRINSIC_W_CHAIN:
1554         if (IsLDG)
1555           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1556                                    NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
1557                                    NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
1558                                    NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
1559                                    NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
1560                                    NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
1561                                    NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
1562         else
1563           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1564                                    NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
1565                                    NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
1566                                    NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
1567                                    NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
1568                                    NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
1569                                    NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
1570         break;
1571       case NVPTXISD::LoadV2:
1572       case NVPTXISD::LDGV2:
1573         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1574                                  NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
1575                                  NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
1576                                  NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
1577                                  NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
1578                                  NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
1579                                  NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
1580         break;
1581       case NVPTXISD::LDUV2:
1582         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1583                                  NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
1584                                  NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
1585                                  NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
1586                                  NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
1587                                  NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
1588                                  NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
1589         break;
1590       case NVPTXISD::LoadV4:
1591       case NVPTXISD::LDGV4:
1592         Opcode = pickOpcodeForVT(
1593             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
1594             NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
1595             NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
1596             NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
1597         break;
1598       case NVPTXISD::LDUV4:
1599         Opcode = pickOpcodeForVT(
1600             EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
1601             NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
1602             NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
1603             NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
1604         break;
1605       }
1606     }
1607     if (!Opcode)
1608       return false;
1609     SDValue Ops[] = { Op1, Chain };
1610     LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1611   }
1612 
1613   // For automatic generation of LDG (through SelectLoad[Vector], not the
1614   // intrinsics), we may have an extending load like:
1615   //
1616   //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
1617   //
1618   // In this case, the matching logic above will select a load for the original
1619   // memory type (in this case, i8) and our types will not match (the node needs
1620   // to return an i32 in this case). Our LDG/LDU nodes do not support the
1621   // concept of sign-/zero-extension, so emulate it here by adding an explicit
1622   // CVT instruction. Ptxas should clean up any redundancies here.
1623 
1624   LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
1625 
1626   if (OrigType != EltVT &&
1627       (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
1628     // We have an extending-load. The instruction we selected operates on the
1629     // smaller type, but the SDNode we are replacing has the larger type. We
1630     // need to emit a CVT to make the types match.
1631     unsigned CvtOpc =
1632         GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);
1633 
1634     // For each output value, apply the manual sign/zero-extension and make sure
1635     // all users of the load go through that CVT.
1636     for (unsigned i = 0; i != NumElts; ++i) {
1637       SDValue Res(LD, i);
1638       SDValue OrigVal(N, i);
1639 
1640       SDNode *CvtNode =
1641         CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
1642                                CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
1643                                                          DL, MVT::i32));
1644       ReplaceUses(OrigVal, SDValue(CvtNode, 0));
1645     }
1646   }
1647 
1648   ReplaceNode(N, LD);
1649   return true;
1650 }
1651 
// Select a PTX "st" instruction for a scalar store node (either a plain
// StoreSDNode or an atomic store with at most monotonic ordering). Picks the
// opcode variant matching the value type and the addressing mode of the base
// pointer, then replaces N with the machine node. Returns false to fall back
// to default instruction selection.
bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *ST = cast<MemSDNode>(N);
  assert(ST->writeMem() && "Expected store");
  // Exactly one of these casts succeeds for a store node.
  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
  assert((PlainStore || AtomicStore) && "Expected store");
  EVT StoreVT = ST->getMemoryVT();
  SDNode *NVPTXST = nullptr;

  // do not support pre/post inc/dec
  if (PlainStore && PlainStore->isIndexed())
    return false;

  if (!StoreVT.isSimple())
    return false;

  AtomicOrdering Ordering = ST->getSuccessOrdering();
  // In order to lower atomic stores with stronger guarantees we would need to
  // use store.release or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Vector Setting
  MVT SimpleVT = StoreVT.getSimpleVT();
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  //
  MVT ScalarVT = SimpleVT.getScalarType();
  unsigned toTypeWidth = ScalarVT.getSizeInBits();
  if (SimpleVT.isVector()) {
    assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
           "Unexpected vector type");
    // v2x16 is stored using st.b32
    toTypeWidth = 32;
  }

  unsigned int toType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = ST->getChain();
  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType SourceVT =
      Value.getNode()->getSimpleValueType(0).SimpleTy;

  // Try the addressing modes in order of decreasing specificity:
  // direct address (avar), symbol+immediate (asi), register+immediate (ari),
  // and finally plain register (areg). The operand list is the same in every
  // case except for the trailing address operand(s).
  if (SelectDirectAddr(BasePtr, Addr)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                             NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
                             NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Addr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                             NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
                             NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
    // Register+immediate needs 32- vs 64-bit opcode variants for the base
    // register; the same applies to the plain-register case below.
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
                          NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64,
                          NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
                               NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
                               NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
    if (!Opcode)
      return false;

    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
                          NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
                          NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
                               NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
                               NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     BasePtr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  }

  if (!NVPTXST)
    return false;

  // Transfer the memory operand so later passes keep the aliasing info.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
  ReplaceNode(N, NVPTXST);
  return true;
}
1808 
1809 bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1810   SDValue Chain = N->getOperand(0);
1811   SDValue Op1 = N->getOperand(1);
1812   SDValue Addr, Offset, Base;
1813   std::optional<unsigned> Opcode;
1814   SDLoc DL(N);
1815   SDNode *ST;
1816   EVT EltVT = Op1.getValueType();
1817   MemSDNode *MemSD = cast<MemSDNode>(N);
1818   EVT StoreVT = MemSD->getMemoryVT();
1819 
1820   // Address Space Setting
1821   unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
1822   if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
1823     report_fatal_error("Cannot store to pointer that points to constant "
1824                        "memory space");
1825   }
1826   unsigned int PointerSize =
1827       CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1828 
1829   // Volatile Setting
1830   // - .volatile is only availalble for .global and .shared
1831   bool IsVolatile = MemSD->isVolatile();
1832   if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1833       CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1834       CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1835     IsVolatile = false;
1836 
1837   // Type Setting: toType + toTypeWidth
1838   // - for integer type, always use 'u'
1839   assert(StoreVT.isSimple() && "Store value is not simple");
1840   MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
1841   unsigned ToTypeWidth = ScalarVT.getSizeInBits();
1842   unsigned ToType = getLdStRegType(ScalarVT);
1843 
1844   SmallVector<SDValue, 12> StOps;
1845   SDValue N2;
1846   unsigned VecType;
1847 
1848   switch (N->getOpcode()) {
1849   case NVPTXISD::StoreV2:
1850     VecType = NVPTX::PTXLdStInstCode::V2;
1851     StOps.push_back(N->getOperand(1));
1852     StOps.push_back(N->getOperand(2));
1853     N2 = N->getOperand(3);
1854     break;
1855   case NVPTXISD::StoreV4:
1856     VecType = NVPTX::PTXLdStInstCode::V4;
1857     StOps.push_back(N->getOperand(1));
1858     StOps.push_back(N->getOperand(2));
1859     StOps.push_back(N->getOperand(3));
1860     StOps.push_back(N->getOperand(4));
1861     N2 = N->getOperand(5);
1862     break;
1863   default:
1864     return false;
1865   }
1866 
1867   // v8x16 is a special case. PTX doesn't have st.v8.x16
1868   // instruction. Instead, we split the vector into v2x16 chunks and
1869   // store them with st.v4.b32.
1870   if (Isv2x16VT(EltVT)) {
1871     assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
1872     EltVT = MVT::i32;
1873     ToType = NVPTX::PTXLdStInstCode::Untyped;
1874     ToTypeWidth = 32;
1875   }
1876 
1877   StOps.push_back(getI32Imm(IsVolatile, DL));
1878   StOps.push_back(getI32Imm(CodeAddrSpace, DL));
1879   StOps.push_back(getI32Imm(VecType, DL));
1880   StOps.push_back(getI32Imm(ToType, DL));
1881   StOps.push_back(getI32Imm(ToTypeWidth, DL));
1882 
1883   if (SelectDirectAddr(N2, Addr)) {
1884     switch (N->getOpcode()) {
1885     default:
1886       return false;
1887     case NVPTXISD::StoreV2:
1888       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1889                                NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
1890                                NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
1891                                NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
1892       break;
1893     case NVPTXISD::StoreV4:
1894       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1895                                NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
1896                                NVPTX::STV_i32_v4_avar, std::nullopt,
1897                                NVPTX::STV_f32_v4_avar, std::nullopt);
1898       break;
1899     }
1900     StOps.push_back(Addr);
1901   } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
1902                                : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
1903     switch (N->getOpcode()) {
1904     default:
1905       return false;
1906     case NVPTXISD::StoreV2:
1907       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1908                                NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
1909                                NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
1910                                NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
1911       break;
1912     case NVPTXISD::StoreV4:
1913       Opcode =
1914           pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
1915                           NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi,
1916                           std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
1917       break;
1918     }
1919     StOps.push_back(Base);
1920     StOps.push_back(Offset);
1921   } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
1922                                : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
1923     if (PointerSize == 64) {
1924       switch (N->getOpcode()) {
1925       default:
1926         return false;
1927       case NVPTXISD::StoreV2:
1928         Opcode =
1929             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1930                             NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64,
1931                             NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64,
1932                             NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64);
1933         break;
1934       case NVPTXISD::StoreV4:
1935         Opcode = pickOpcodeForVT(
1936             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
1937             NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
1938             NVPTX::STV_f32_v4_ari_64, std::nullopt);
1939         break;
1940       }
1941     } else {
1942       switch (N->getOpcode()) {
1943       default:
1944         return false;
1945       case NVPTXISD::StoreV2:
1946         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1947                                  NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
1948                                  NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
1949                                  NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
1950         break;
1951       case NVPTXISD::StoreV4:
1952         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1953                                  NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
1954                                  NVPTX::STV_i32_v4_ari, std::nullopt,
1955                                  NVPTX::STV_f32_v4_ari, std::nullopt);
1956         break;
1957       }
1958     }
1959     StOps.push_back(Base);
1960     StOps.push_back(Offset);
1961   } else {
1962     if (PointerSize == 64) {
1963       switch (N->getOpcode()) {
1964       default:
1965         return false;
1966       case NVPTXISD::StoreV2:
1967         Opcode = pickOpcodeForVT(
1968             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
1969             NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
1970             NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
1971             NVPTX::STV_f64_v2_areg_64);
1972         break;
1973       case NVPTXISD::StoreV4:
1974         Opcode = pickOpcodeForVT(
1975             EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
1976             NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
1977             NVPTX::STV_f32_v4_areg_64, std::nullopt);
1978         break;
1979       }
1980     } else {
1981       switch (N->getOpcode()) {
1982       default:
1983         return false;
1984       case NVPTXISD::StoreV2:
1985         Opcode =
1986             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
1987                             NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
1988                             NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg,
1989                             NVPTX::STV_f64_v2_areg);
1990         break;
1991       case NVPTXISD::StoreV4:
1992         Opcode =
1993             pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
1994                             NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg,
1995                             std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt);
1996         break;
1997       }
1998     }
1999     StOps.push_back(N2);
2000   }
2001 
2002   if (!Opcode)
2003     return false;
2004 
2005   StOps.push_back(Chain);
2006 
2007   ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps);
2008 
2009   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2010   CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
2011 
2012   ReplaceNode(N, ST);
2013   return true;
2014 }
2015 
2016 bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
2017   SDValue Chain = Node->getOperand(0);
2018   SDValue Offset = Node->getOperand(2);
2019   SDValue Glue = Node->getOperand(3);
2020   SDLoc DL(Node);
2021   MemSDNode *Mem = cast<MemSDNode>(Node);
2022 
2023   unsigned VecSize;
2024   switch (Node->getOpcode()) {
2025   default:
2026     return false;
2027   case NVPTXISD::LoadParam:
2028     VecSize = 1;
2029     break;
2030   case NVPTXISD::LoadParamV2:
2031     VecSize = 2;
2032     break;
2033   case NVPTXISD::LoadParamV4:
2034     VecSize = 4;
2035     break;
2036   }
2037 
2038   EVT EltVT = Node->getValueType(0);
2039   EVT MemVT = Mem->getMemoryVT();
2040 
2041   std::optional<unsigned> Opcode;
2042 
2043   switch (VecSize) {
2044   default:
2045     return false;
2046   case 1:
2047     Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
2048                              NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
2049                              NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
2050                              NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
2051     break;
2052   case 2:
2053     Opcode =
2054         pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
2055                         NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
2056                         NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,
2057                         NVPTX::LoadParamMemV2F64);
2058     break;
2059   case 4:
2060     Opcode =
2061         pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
2062                         NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,
2063                         std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);
2064     break;
2065   }
2066   if (!Opcode)
2067     return false;
2068 
2069   SDVTList VTs;
2070   if (VecSize == 1) {
2071     VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
2072   } else if (VecSize == 2) {
2073     VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
2074   } else {
2075     EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
2076     VTs = CurDAG->getVTList(EVTs);
2077   }
2078 
2079   unsigned OffsetVal = Offset->getAsZExtVal();
2080 
2081   SmallVector<SDValue, 2> Ops;
2082   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2083   Ops.push_back(Chain);
2084   Ops.push_back(Glue);
2085 
2086   ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
2087   return true;
2088 }
2089 
2090 bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
2091   SDLoc DL(N);
2092   SDValue Chain = N->getOperand(0);
2093   SDValue Offset = N->getOperand(1);
2094   unsigned OffsetVal = Offset->getAsZExtVal();
2095   MemSDNode *Mem = cast<MemSDNode>(N);
2096 
2097   // How many elements do we have?
2098   unsigned NumElts = 1;
2099   switch (N->getOpcode()) {
2100   default:
2101     return false;
2102   case NVPTXISD::StoreRetval:
2103     NumElts = 1;
2104     break;
2105   case NVPTXISD::StoreRetvalV2:
2106     NumElts = 2;
2107     break;
2108   case NVPTXISD::StoreRetvalV4:
2109     NumElts = 4;
2110     break;
2111   }
2112 
2113   // Build vector of operands
2114   SmallVector<SDValue, 6> Ops;
2115   for (unsigned i = 0; i < NumElts; ++i)
2116     Ops.push_back(N->getOperand(i + 2));
2117   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2118   Ops.push_back(Chain);
2119 
2120   // Determine target opcode
2121   // If we have an i1, use an 8-bit store. The lowering code in
2122   // NVPTXISelLowering will have already emitted an upcast.
2123   std::optional<unsigned> Opcode = 0;
2124   switch (NumElts) {
2125   default:
2126     return false;
2127   case 1:
2128     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2129                              NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
2130                              NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
2131                              NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
2132     break;
2133   case 2:
2134     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2135                              NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2136                              NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2137                              NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2138     break;
2139   case 4:
2140     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2141                              NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2142                              NVPTX::StoreRetvalV4I32, std::nullopt,
2143                              NVPTX::StoreRetvalV4F32, std::nullopt);
2144     break;
2145   }
2146   if (!Opcode)
2147     return false;
2148 
2149   SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
2150   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2151   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2152 
2153   ReplaceNode(N, Ret);
2154   return true;
2155 }
2156 
2157 bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2158   SDLoc DL(N);
2159   SDValue Chain = N->getOperand(0);
2160   SDValue Param = N->getOperand(1);
2161   unsigned ParamVal = Param->getAsZExtVal();
2162   SDValue Offset = N->getOperand(2);
2163   unsigned OffsetVal = Offset->getAsZExtVal();
2164   MemSDNode *Mem = cast<MemSDNode>(N);
2165   SDValue Glue = N->getOperand(N->getNumOperands() - 1);
2166 
2167   // How many elements do we have?
2168   unsigned NumElts = 1;
2169   switch (N->getOpcode()) {
2170   default:
2171     return false;
2172   case NVPTXISD::StoreParamU32:
2173   case NVPTXISD::StoreParamS32:
2174   case NVPTXISD::StoreParam:
2175     NumElts = 1;
2176     break;
2177   case NVPTXISD::StoreParamV2:
2178     NumElts = 2;
2179     break;
2180   case NVPTXISD::StoreParamV4:
2181     NumElts = 4;
2182     break;
2183   }
2184 
2185   // Build vector of operands
2186   SmallVector<SDValue, 8> Ops;
2187   for (unsigned i = 0; i < NumElts; ++i)
2188     Ops.push_back(N->getOperand(i + 3));
2189   Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
2190   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2191   Ops.push_back(Chain);
2192   Ops.push_back(Glue);
2193 
2194   // Determine target opcode
2195   // If we have an i1, use an 8-bit store. The lowering code in
2196   // NVPTXISelLowering will have already emitted an upcast.
2197   std::optional<unsigned> Opcode = 0;
2198   switch (N->getOpcode()) {
2199   default:
2200     switch (NumElts) {
2201     default:
2202       return false;
2203     case 1:
2204       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2205                                NVPTX::StoreParamI8, NVPTX::StoreParamI16,
2206                                NVPTX::StoreParamI32, NVPTX::StoreParamI64,
2207                                NVPTX::StoreParamF32, NVPTX::StoreParamF64);
2208       break;
2209     case 2:
2210       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2211                                NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
2212                                NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
2213                                NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
2214       break;
2215     case 4:
2216       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2217                                NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
2218                                NVPTX::StoreParamV4I32, std::nullopt,
2219                                NVPTX::StoreParamV4F32, std::nullopt);
2220       break;
2221     }
2222     if (!Opcode)
2223       return false;
2224     break;
2225   // Special case: if we have a sign-extend/zero-extend node, insert the
2226   // conversion instruction first, and use that as the value operand to
2227   // the selected StoreParam node.
2228   case NVPTXISD::StoreParamU32: {
2229     Opcode = NVPTX::StoreParamI32;
2230     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2231                                                 MVT::i32);
2232     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2233                                          MVT::i32, Ops[0], CvtNone);
2234     Ops[0] = SDValue(Cvt, 0);
2235     break;
2236   }
2237   case NVPTXISD::StoreParamS32: {
2238     Opcode = NVPTX::StoreParamI32;
2239     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2240                                                 MVT::i32);
2241     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2242                                          MVT::i32, Ops[0], CvtNone);
2243     Ops[0] = SDValue(Cvt, 0);
2244     break;
2245   }
2246   }
2247 
2248   SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2249   SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
2250   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2251   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2252 
2253   ReplaceNode(N, Ret);
2254   return true;
2255 }
2256 
// Select a machine texture-fetch instruction for an NVPTXISD texture node.
//
// This is a pure one-to-one mapping: each NVPTXISD::Tex*/Tld4* opcode is
// translated to the corresponding NVPTX::TEX_*/TLD4_* machine opcode, and the
// node is replaced by a MachineSDNode with the same value types. Returns false
// (leaving N untouched) if N is not one of the texture opcodes handled here.
//
// Naming pattern of the cases, as visible below:
//  - geometry: 1D / 1D_ARRAY / 2D / 2D_ARRAY / 3D / CUBE / CUBE_ARRAY
//  - result type then coordinate type: F32/S32/U32 x S32/F32
//  - optional _LEVEL (explicit LOD) / _GRAD (explicit gradients) variants
//  - Tld4{R,G,B,A}* nodes map to TLD4_{R,G,B,A}_2D_* gather opcodes
//  - TexUnified*/Tld4Unified* nodes map to the _R-suffixed opcodes, while the
//    non-unified nodes map to the _RR-suffixed ones. NOTE(review): the _RR vs
//    _R suffix presumably reflects separate texture+sampler handles vs a
//    single unified handle — confirm against NVPTXIntrinsics.td.
bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
  unsigned Opc = 0;

  switch (N->getOpcode()) {
  default: return false;
  // Non-unified (independent texture/sampler) fetches: *_RR opcodes.
  case NVPTXISD::Tex1DFloatS32:
    Opc = NVPTX::TEX_1D_F32_S32_RR;
    break;
  case NVPTXISD::Tex1DFloatFloat:
    Opc = NVPTX::TEX_1D_F32_F32_RR;
    break;
  case NVPTXISD::Tex1DFloatFloatLevel:
    Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DFloatFloatGrad:
    Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DS32S32:
    Opc = NVPTX::TEX_1D_S32_S32_RR;
    break;
  case NVPTXISD::Tex1DS32Float:
    Opc = NVPTX::TEX_1D_S32_F32_RR;
    break;
  case NVPTXISD::Tex1DS32FloatLevel:
    Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DS32FloatGrad:
    Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DU32S32:
    Opc = NVPTX::TEX_1D_U32_S32_RR;
    break;
  case NVPTXISD::Tex1DU32Float:
    Opc = NVPTX::TEX_1D_U32_F32_RR;
    break;
  case NVPTXISD::Tex1DU32FloatLevel:
    Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DU32FloatGrad:
    Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatS32:
    Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloat:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayS32S32:
    Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayS32Float:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayS32FloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayS32FloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayU32S32:
    Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayU32Float:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayU32FloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayU32FloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DFloatS32:
    Opc = NVPTX::TEX_2D_F32_S32_RR;
    break;
  case NVPTXISD::Tex2DFloatFloat:
    Opc = NVPTX::TEX_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tex2DFloatFloatLevel:
    Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DFloatFloatGrad:
    Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DS32S32:
    Opc = NVPTX::TEX_2D_S32_S32_RR;
    break;
  case NVPTXISD::Tex2DS32Float:
    Opc = NVPTX::TEX_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tex2DS32FloatLevel:
    Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DS32FloatGrad:
    Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DU32S32:
    Opc = NVPTX::TEX_2D_U32_S32_RR;
    break;
  case NVPTXISD::Tex2DU32Float:
    Opc = NVPTX::TEX_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tex2DU32FloatLevel:
    Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DU32FloatGrad:
    Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatS32:
    Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloat:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayS32S32:
    Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayS32Float:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayS32FloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayS32FloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayU32S32:
    Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayU32Float:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayU32FloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayU32FloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DFloatS32:
    Opc = NVPTX::TEX_3D_F32_S32_RR;
    break;
  case NVPTXISD::Tex3DFloatFloat:
    Opc = NVPTX::TEX_3D_F32_F32_RR;
    break;
  case NVPTXISD::Tex3DFloatFloatLevel:
    Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DFloatFloatGrad:
    Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DS32S32:
    Opc = NVPTX::TEX_3D_S32_S32_RR;
    break;
  case NVPTXISD::Tex3DS32Float:
    Opc = NVPTX::TEX_3D_S32_F32_RR;
    break;
  case NVPTXISD::Tex3DS32FloatLevel:
    Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DS32FloatGrad:
    Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DU32S32:
    Opc = NVPTX::TEX_3D_U32_S32_RR;
    break;
  case NVPTXISD::Tex3DU32Float:
    Opc = NVPTX::TEX_3D_U32_F32_RR;
    break;
  case NVPTXISD::Tex3DU32FloatLevel:
    Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DU32FloatGrad:
    Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR;
    break;
  // Cube maps only come in F32-coordinate base and _LEVEL forms (no _GRAD,
  // no S32 coordinates) — mirrored by the node set handled here.
  case NVPTXISD::TexCubeFloatFloat:
    Opc = NVPTX::TEX_CUBE_F32_F32_RR;
    break;
  case NVPTXISD::TexCubeFloatFloatLevel:
    Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeS32Float:
    Opc = NVPTX::TEX_CUBE_S32_F32_RR;
    break;
  case NVPTXISD::TexCubeS32FloatLevel:
    Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeU32Float:
    Opc = NVPTX::TEX_CUBE_U32_F32_RR;
    break;
  case NVPTXISD::TexCubeU32FloatLevel:
    Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayFloatFloat:
    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayS32Float:
    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayS32FloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayU32Float:
    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayU32FloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR;
    break;
  // tld4 (four-texel gather of one component R/G/B/A from a 2D texture).
  // Note the ISD names say S64/U64 but map to S32/U32 machine opcodes.
  case NVPTXISD::Tld4R2DFloatFloat:
    Opc = NVPTX::TLD4_R_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DFloatFloat:
    Opc = NVPTX::TLD4_G_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DFloatFloat:
    Opc = NVPTX::TLD4_B_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DFloatFloat:
    Opc = NVPTX::TLD4_A_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4R2DS64Float:
    Opc = NVPTX::TLD4_R_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DS64Float:
    Opc = NVPTX::TLD4_G_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DS64Float:
    Opc = NVPTX::TLD4_B_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DS64Float:
    Opc = NVPTX::TLD4_A_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4R2DU64Float:
    Opc = NVPTX::TLD4_R_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DU64Float:
    Opc = NVPTX::TLD4_G_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DU64Float:
    Opc = NVPTX::TLD4_B_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DU64Float:
    Opc = NVPTX::TLD4_A_2D_U32_F32_RR;
    break;
  // Unified-mode fetches: TexUnified*/Tld4Unified* nodes -> *_R opcodes.
  // Same geometry/type lattice as the non-unified set above.
  case NVPTXISD::TexUnified1DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DS32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified1DS32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified1DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DU32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified1DU32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified1DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatS32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DS32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified2DS32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified2DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DU32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified2DU32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified2DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatS32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DS32S32:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified3DS32Float:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified3DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DU32S32:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified3DU32Float:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified3DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnifiedCubeFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeS32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeU32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedR2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedR2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R;
    break;
  }

  // Copy over operands: drop operand 0 (the chain) from the front and
  // re-append it at the end, since the machine instruction expects the
  // chain last.
  SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
  Ops.push_back(N->getOperand(0)); // Move chain to the back.

  // Emit the machine node with the same result types and replace all uses.
  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
  return true;
}
2775 
2776 bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
2777   unsigned Opc = 0;
2778   switch (N->getOpcode()) {
2779   default: return false;
2780   case NVPTXISD::Suld1DI8Clamp:
2781     Opc = NVPTX::SULD_1D_I8_CLAMP_R;
2782     break;
2783   case NVPTXISD::Suld1DI16Clamp:
2784     Opc = NVPTX::SULD_1D_I16_CLAMP_R;
2785     break;
2786   case NVPTXISD::Suld1DI32Clamp:
2787     Opc = NVPTX::SULD_1D_I32_CLAMP_R;
2788     break;
2789   case NVPTXISD::Suld1DI64Clamp:
2790     Opc = NVPTX::SULD_1D_I64_CLAMP_R;
2791     break;
2792   case NVPTXISD::Suld1DV2I8Clamp:
2793     Opc = NVPTX::SULD_1D_V2I8_CLAMP_R;
2794     break;
2795   case NVPTXISD::Suld1DV2I16Clamp:
2796     Opc = NVPTX::SULD_1D_V2I16_CLAMP_R;
2797     break;
2798   case NVPTXISD::Suld1DV2I32Clamp:
2799     Opc = NVPTX::SULD_1D_V2I32_CLAMP_R;
2800     break;
2801   case NVPTXISD::Suld1DV2I64Clamp:
2802     Opc = NVPTX::SULD_1D_V2I64_CLAMP_R;
2803     break;
2804   case NVPTXISD::Suld1DV4I8Clamp:
2805     Opc = NVPTX::SULD_1D_V4I8_CLAMP_R;
2806     break;
2807   case NVPTXISD::Suld1DV4I16Clamp:
2808     Opc = NVPTX::SULD_1D_V4I16_CLAMP_R;
2809     break;
2810   case NVPTXISD::Suld1DV4I32Clamp:
2811     Opc = NVPTX::SULD_1D_V4I32_CLAMP_R;
2812     break;
2813   case NVPTXISD::Suld1DArrayI8Clamp:
2814     Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R;
2815     break;
2816   case NVPTXISD::Suld1DArrayI16Clamp:
2817     Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R;
2818     break;
2819   case NVPTXISD::Suld1DArrayI32Clamp:
2820     Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R;
2821     break;
2822   case NVPTXISD::Suld1DArrayI64Clamp:
2823     Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R;
2824     break;
2825   case NVPTXISD::Suld1DArrayV2I8Clamp:
2826     Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R;
2827     break;
2828   case NVPTXISD::Suld1DArrayV2I16Clamp:
2829     Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R;
2830     break;
2831   case NVPTXISD::Suld1DArrayV2I32Clamp:
2832     Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R;
2833     break;
2834   case NVPTXISD::Suld1DArrayV2I64Clamp:
2835     Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R;
2836     break;
2837   case NVPTXISD::Suld1DArrayV4I8Clamp:
2838     Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R;
2839     break;
2840   case NVPTXISD::Suld1DArrayV4I16Clamp:
2841     Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R;
2842     break;
2843   case NVPTXISD::Suld1DArrayV4I32Clamp:
2844     Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R;
2845     break;
2846   case NVPTXISD::Suld2DI8Clamp:
2847     Opc = NVPTX::SULD_2D_I8_CLAMP_R;
2848     break;
2849   case NVPTXISD::Suld2DI16Clamp:
2850     Opc = NVPTX::SULD_2D_I16_CLAMP_R;
2851     break;
2852   case NVPTXISD::Suld2DI32Clamp:
2853     Opc = NVPTX::SULD_2D_I32_CLAMP_R;
2854     break;
2855   case NVPTXISD::Suld2DI64Clamp:
2856     Opc = NVPTX::SULD_2D_I64_CLAMP_R;
2857     break;
2858   case NVPTXISD::Suld2DV2I8Clamp:
2859     Opc = NVPTX::SULD_2D_V2I8_CLAMP_R;
2860     break;
2861   case NVPTXISD::Suld2DV2I16Clamp:
2862     Opc = NVPTX::SULD_2D_V2I16_CLAMP_R;
2863     break;
2864   case NVPTXISD::Suld2DV2I32Clamp:
2865     Opc = NVPTX::SULD_2D_V2I32_CLAMP_R;
2866     break;
2867   case NVPTXISD::Suld2DV2I64Clamp:
2868     Opc = NVPTX::SULD_2D_V2I64_CLAMP_R;
2869     break;
2870   case NVPTXISD::Suld2DV4I8Clamp:
2871     Opc = NVPTX::SULD_2D_V4I8_CLAMP_R;
2872     break;
2873   case NVPTXISD::Suld2DV4I16Clamp:
2874     Opc = NVPTX::SULD_2D_V4I16_CLAMP_R;
2875     break;
2876   case NVPTXISD::Suld2DV4I32Clamp:
2877     Opc = NVPTX::SULD_2D_V4I32_CLAMP_R;
2878     break;
2879   case NVPTXISD::Suld2DArrayI8Clamp:
2880     Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R;
2881     break;
2882   case NVPTXISD::Suld2DArrayI16Clamp:
2883     Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R;
2884     break;
2885   case NVPTXISD::Suld2DArrayI32Clamp:
2886     Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R;
2887     break;
2888   case NVPTXISD::Suld2DArrayI64Clamp:
2889     Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R;
2890     break;
2891   case NVPTXISD::Suld2DArrayV2I8Clamp:
2892     Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R;
2893     break;
2894   case NVPTXISD::Suld2DArrayV2I16Clamp:
2895     Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R;
2896     break;
2897   case NVPTXISD::Suld2DArrayV2I32Clamp:
2898     Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R;
2899     break;
2900   case NVPTXISD::Suld2DArrayV2I64Clamp:
2901     Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R;
2902     break;
2903   case NVPTXISD::Suld2DArrayV4I8Clamp:
2904     Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R;
2905     break;
2906   case NVPTXISD::Suld2DArrayV4I16Clamp:
2907     Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R;
2908     break;
2909   case NVPTXISD::Suld2DArrayV4I32Clamp:
2910     Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R;
2911     break;
2912   case NVPTXISD::Suld3DI8Clamp:
2913     Opc = NVPTX::SULD_3D_I8_CLAMP_R;
2914     break;
2915   case NVPTXISD::Suld3DI16Clamp:
2916     Opc = NVPTX::SULD_3D_I16_CLAMP_R;
2917     break;
2918   case NVPTXISD::Suld3DI32Clamp:
2919     Opc = NVPTX::SULD_3D_I32_CLAMP_R;
2920     break;
2921   case NVPTXISD::Suld3DI64Clamp:
2922     Opc = NVPTX::SULD_3D_I64_CLAMP_R;
2923     break;
2924   case NVPTXISD::Suld3DV2I8Clamp:
2925     Opc = NVPTX::SULD_3D_V2I8_CLAMP_R;
2926     break;
2927   case NVPTXISD::Suld3DV2I16Clamp:
2928     Opc = NVPTX::SULD_3D_V2I16_CLAMP_R;
2929     break;
2930   case NVPTXISD::Suld3DV2I32Clamp:
2931     Opc = NVPTX::SULD_3D_V2I32_CLAMP_R;
2932     break;
2933   case NVPTXISD::Suld3DV2I64Clamp:
2934     Opc = NVPTX::SULD_3D_V2I64_CLAMP_R;
2935     break;
2936   case NVPTXISD::Suld3DV4I8Clamp:
2937     Opc = NVPTX::SULD_3D_V4I8_CLAMP_R;
2938     break;
2939   case NVPTXISD::Suld3DV4I16Clamp:
2940     Opc = NVPTX::SULD_3D_V4I16_CLAMP_R;
2941     break;
2942   case NVPTXISD::Suld3DV4I32Clamp:
2943     Opc = NVPTX::SULD_3D_V4I32_CLAMP_R;
2944     break;
2945   case NVPTXISD::Suld1DI8Trap:
2946     Opc = NVPTX::SULD_1D_I8_TRAP_R;
2947     break;
2948   case NVPTXISD::Suld1DI16Trap:
2949     Opc = NVPTX::SULD_1D_I16_TRAP_R;
2950     break;
2951   case NVPTXISD::Suld1DI32Trap:
2952     Opc = NVPTX::SULD_1D_I32_TRAP_R;
2953     break;
2954   case NVPTXISD::Suld1DI64Trap:
2955     Opc = NVPTX::SULD_1D_I64_TRAP_R;
2956     break;
2957   case NVPTXISD::Suld1DV2I8Trap:
2958     Opc = NVPTX::SULD_1D_V2I8_TRAP_R;
2959     break;
2960   case NVPTXISD::Suld1DV2I16Trap:
2961     Opc = NVPTX::SULD_1D_V2I16_TRAP_R;
2962     break;
2963   case NVPTXISD::Suld1DV2I32Trap:
2964     Opc = NVPTX::SULD_1D_V2I32_TRAP_R;
2965     break;
2966   case NVPTXISD::Suld1DV2I64Trap:
2967     Opc = NVPTX::SULD_1D_V2I64_TRAP_R;
2968     break;
2969   case NVPTXISD::Suld1DV4I8Trap:
2970     Opc = NVPTX::SULD_1D_V4I8_TRAP_R;
2971     break;
2972   case NVPTXISD::Suld1DV4I16Trap:
2973     Opc = NVPTX::SULD_1D_V4I16_TRAP_R;
2974     break;
2975   case NVPTXISD::Suld1DV4I32Trap:
2976     Opc = NVPTX::SULD_1D_V4I32_TRAP_R;
2977     break;
2978   case NVPTXISD::Suld1DArrayI8Trap:
2979     Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R;
2980     break;
2981   case NVPTXISD::Suld1DArrayI16Trap:
2982     Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R;
2983     break;
2984   case NVPTXISD::Suld1DArrayI32Trap:
2985     Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R;
2986     break;
2987   case NVPTXISD::Suld1DArrayI64Trap:
2988     Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R;
2989     break;
2990   case NVPTXISD::Suld1DArrayV2I8Trap:
2991     Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R;
2992     break;
2993   case NVPTXISD::Suld1DArrayV2I16Trap:
2994     Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R;
2995     break;
2996   case NVPTXISD::Suld1DArrayV2I32Trap:
2997     Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R;
2998     break;
2999   case NVPTXISD::Suld1DArrayV2I64Trap:
3000     Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R;
3001     break;
3002   case NVPTXISD::Suld1DArrayV4I8Trap:
3003     Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R;
3004     break;
3005   case NVPTXISD::Suld1DArrayV4I16Trap:
3006     Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R;
3007     break;
3008   case NVPTXISD::Suld1DArrayV4I32Trap:
3009     Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R;
3010     break;
3011   case NVPTXISD::Suld2DI8Trap:
3012     Opc = NVPTX::SULD_2D_I8_TRAP_R;
3013     break;
3014   case NVPTXISD::Suld2DI16Trap:
3015     Opc = NVPTX::SULD_2D_I16_TRAP_R;
3016     break;
3017   case NVPTXISD::Suld2DI32Trap:
3018     Opc = NVPTX::SULD_2D_I32_TRAP_R;
3019     break;
3020   case NVPTXISD::Suld2DI64Trap:
3021     Opc = NVPTX::SULD_2D_I64_TRAP_R;
3022     break;
3023   case NVPTXISD::Suld2DV2I8Trap:
3024     Opc = NVPTX::SULD_2D_V2I8_TRAP_R;
3025     break;
3026   case NVPTXISD::Suld2DV2I16Trap:
3027     Opc = NVPTX::SULD_2D_V2I16_TRAP_R;
3028     break;
3029   case NVPTXISD::Suld2DV2I32Trap:
3030     Opc = NVPTX::SULD_2D_V2I32_TRAP_R;
3031     break;
3032   case NVPTXISD::Suld2DV2I64Trap:
3033     Opc = NVPTX::SULD_2D_V2I64_TRAP_R;
3034     break;
3035   case NVPTXISD::Suld2DV4I8Trap:
3036     Opc = NVPTX::SULD_2D_V4I8_TRAP_R;
3037     break;
3038   case NVPTXISD::Suld2DV4I16Trap:
3039     Opc = NVPTX::SULD_2D_V4I16_TRAP_R;
3040     break;
3041   case NVPTXISD::Suld2DV4I32Trap:
3042     Opc = NVPTX::SULD_2D_V4I32_TRAP_R;
3043     break;
3044   case NVPTXISD::Suld2DArrayI8Trap:
3045     Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R;
3046     break;
3047   case NVPTXISD::Suld2DArrayI16Trap:
3048     Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R;
3049     break;
3050   case NVPTXISD::Suld2DArrayI32Trap:
3051     Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R;
3052     break;
3053   case NVPTXISD::Suld2DArrayI64Trap:
3054     Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R;
3055     break;
3056   case NVPTXISD::Suld2DArrayV2I8Trap:
3057     Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R;
3058     break;
3059   case NVPTXISD::Suld2DArrayV2I16Trap:
3060     Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R;
3061     break;
3062   case NVPTXISD::Suld2DArrayV2I32Trap:
3063     Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R;
3064     break;
3065   case NVPTXISD::Suld2DArrayV2I64Trap:
3066     Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R;
3067     break;
3068   case NVPTXISD::Suld2DArrayV4I8Trap:
3069     Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R;
3070     break;
3071   case NVPTXISD::Suld2DArrayV4I16Trap:
3072     Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R;
3073     break;
3074   case NVPTXISD::Suld2DArrayV4I32Trap:
3075     Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R;
3076     break;
3077   case NVPTXISD::Suld3DI8Trap:
3078     Opc = NVPTX::SULD_3D_I8_TRAP_R;
3079     break;
3080   case NVPTXISD::Suld3DI16Trap:
3081     Opc = NVPTX::SULD_3D_I16_TRAP_R;
3082     break;
3083   case NVPTXISD::Suld3DI32Trap:
3084     Opc = NVPTX::SULD_3D_I32_TRAP_R;
3085     break;
3086   case NVPTXISD::Suld3DI64Trap:
3087     Opc = NVPTX::SULD_3D_I64_TRAP_R;
3088     break;
3089   case NVPTXISD::Suld3DV2I8Trap:
3090     Opc = NVPTX::SULD_3D_V2I8_TRAP_R;
3091     break;
3092   case NVPTXISD::Suld3DV2I16Trap:
3093     Opc = NVPTX::SULD_3D_V2I16_TRAP_R;
3094     break;
3095   case NVPTXISD::Suld3DV2I32Trap:
3096     Opc = NVPTX::SULD_3D_V2I32_TRAP_R;
3097     break;
3098   case NVPTXISD::Suld3DV2I64Trap:
3099     Opc = NVPTX::SULD_3D_V2I64_TRAP_R;
3100     break;
3101   case NVPTXISD::Suld3DV4I8Trap:
3102     Opc = NVPTX::SULD_3D_V4I8_TRAP_R;
3103     break;
3104   case NVPTXISD::Suld3DV4I16Trap:
3105     Opc = NVPTX::SULD_3D_V4I16_TRAP_R;
3106     break;
3107   case NVPTXISD::Suld3DV4I32Trap:
3108     Opc = NVPTX::SULD_3D_V4I32_TRAP_R;
3109     break;
3110   case NVPTXISD::Suld1DI8Zero:
3111     Opc = NVPTX::SULD_1D_I8_ZERO_R;
3112     break;
3113   case NVPTXISD::Suld1DI16Zero:
3114     Opc = NVPTX::SULD_1D_I16_ZERO_R;
3115     break;
3116   case NVPTXISD::Suld1DI32Zero:
3117     Opc = NVPTX::SULD_1D_I32_ZERO_R;
3118     break;
3119   case NVPTXISD::Suld1DI64Zero:
3120     Opc = NVPTX::SULD_1D_I64_ZERO_R;
3121     break;
3122   case NVPTXISD::Suld1DV2I8Zero:
3123     Opc = NVPTX::SULD_1D_V2I8_ZERO_R;
3124     break;
3125   case NVPTXISD::Suld1DV2I16Zero:
3126     Opc = NVPTX::SULD_1D_V2I16_ZERO_R;
3127     break;
3128   case NVPTXISD::Suld1DV2I32Zero:
3129     Opc = NVPTX::SULD_1D_V2I32_ZERO_R;
3130     break;
3131   case NVPTXISD::Suld1DV2I64Zero:
3132     Opc = NVPTX::SULD_1D_V2I64_ZERO_R;
3133     break;
3134   case NVPTXISD::Suld1DV4I8Zero:
3135     Opc = NVPTX::SULD_1D_V4I8_ZERO_R;
3136     break;
3137   case NVPTXISD::Suld1DV4I16Zero:
3138     Opc = NVPTX::SULD_1D_V4I16_ZERO_R;
3139     break;
3140   case NVPTXISD::Suld1DV4I32Zero:
3141     Opc = NVPTX::SULD_1D_V4I32_ZERO_R;
3142     break;
3143   case NVPTXISD::Suld1DArrayI8Zero:
3144     Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R;
3145     break;
3146   case NVPTXISD::Suld1DArrayI16Zero:
3147     Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R;
3148     break;
3149   case NVPTXISD::Suld1DArrayI32Zero:
3150     Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R;
3151     break;
3152   case NVPTXISD::Suld1DArrayI64Zero:
3153     Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R;
3154     break;
3155   case NVPTXISD::Suld1DArrayV2I8Zero:
3156     Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R;
3157     break;
3158   case NVPTXISD::Suld1DArrayV2I16Zero:
3159     Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R;
3160     break;
3161   case NVPTXISD::Suld1DArrayV2I32Zero:
3162     Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R;
3163     break;
3164   case NVPTXISD::Suld1DArrayV2I64Zero:
3165     Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R;
3166     break;
3167   case NVPTXISD::Suld1DArrayV4I8Zero:
3168     Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R;
3169     break;
3170   case NVPTXISD::Suld1DArrayV4I16Zero:
3171     Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R;
3172     break;
3173   case NVPTXISD::Suld1DArrayV4I32Zero:
3174     Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R;
3175     break;
3176   case NVPTXISD::Suld2DI8Zero:
3177     Opc = NVPTX::SULD_2D_I8_ZERO_R;
3178     break;
3179   case NVPTXISD::Suld2DI16Zero:
3180     Opc = NVPTX::SULD_2D_I16_ZERO_R;
3181     break;
3182   case NVPTXISD::Suld2DI32Zero:
3183     Opc = NVPTX::SULD_2D_I32_ZERO_R;
3184     break;
3185   case NVPTXISD::Suld2DI64Zero:
3186     Opc = NVPTX::SULD_2D_I64_ZERO_R;
3187     break;
3188   case NVPTXISD::Suld2DV2I8Zero:
3189     Opc = NVPTX::SULD_2D_V2I8_ZERO_R;
3190     break;
3191   case NVPTXISD::Suld2DV2I16Zero:
3192     Opc = NVPTX::SULD_2D_V2I16_ZERO_R;
3193     break;
3194   case NVPTXISD::Suld2DV2I32Zero:
3195     Opc = NVPTX::SULD_2D_V2I32_ZERO_R;
3196     break;
3197   case NVPTXISD::Suld2DV2I64Zero:
3198     Opc = NVPTX::SULD_2D_V2I64_ZERO_R;
3199     break;
3200   case NVPTXISD::Suld2DV4I8Zero:
3201     Opc = NVPTX::SULD_2D_V4I8_ZERO_R;
3202     break;
3203   case NVPTXISD::Suld2DV4I16Zero:
3204     Opc = NVPTX::SULD_2D_V4I16_ZERO_R;
3205     break;
3206   case NVPTXISD::Suld2DV4I32Zero:
3207     Opc = NVPTX::SULD_2D_V4I32_ZERO_R;
3208     break;
3209   case NVPTXISD::Suld2DArrayI8Zero:
3210     Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;
3211     break;
3212   case NVPTXISD::Suld2DArrayI16Zero:
3213     Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;
3214     break;
3215   case NVPTXISD::Suld2DArrayI32Zero:
3216     Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;
3217     break;
3218   case NVPTXISD::Suld2DArrayI64Zero:
3219     Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;
3220     break;
3221   case NVPTXISD::Suld2DArrayV2I8Zero:
3222     Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;
3223     break;
3224   case NVPTXISD::Suld2DArrayV2I16Zero:
3225     Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;
3226     break;
3227   case NVPTXISD::Suld2DArrayV2I32Zero:
3228     Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;
3229     break;
3230   case NVPTXISD::Suld2DArrayV2I64Zero:
3231     Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;
3232     break;
3233   case NVPTXISD::Suld2DArrayV4I8Zero:
3234     Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;
3235     break;
3236   case NVPTXISD::Suld2DArrayV4I16Zero:
3237     Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;
3238     break;
3239   case NVPTXISD::Suld2DArrayV4I32Zero:
3240     Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;
3241     break;
3242   case NVPTXISD::Suld3DI8Zero:
3243     Opc = NVPTX::SULD_3D_I8_ZERO_R;
3244     break;
3245   case NVPTXISD::Suld3DI16Zero:
3246     Opc = NVPTX::SULD_3D_I16_ZERO_R;
3247     break;
3248   case NVPTXISD::Suld3DI32Zero:
3249     Opc = NVPTX::SULD_3D_I32_ZERO_R;
3250     break;
3251   case NVPTXISD::Suld3DI64Zero:
3252     Opc = NVPTX::SULD_3D_I64_ZERO_R;
3253     break;
3254   case NVPTXISD::Suld3DV2I8Zero:
3255     Opc = NVPTX::SULD_3D_V2I8_ZERO_R;
3256     break;
3257   case NVPTXISD::Suld3DV2I16Zero:
3258     Opc = NVPTX::SULD_3D_V2I16_ZERO_R;
3259     break;
3260   case NVPTXISD::Suld3DV2I32Zero:
3261     Opc = NVPTX::SULD_3D_V2I32_ZERO_R;
3262     break;
3263   case NVPTXISD::Suld3DV2I64Zero:
3264     Opc = NVPTX::SULD_3D_V2I64_ZERO_R;
3265     break;
3266   case NVPTXISD::Suld3DV4I8Zero:
3267     Opc = NVPTX::SULD_3D_V4I8_ZERO_R;
3268     break;
3269   case NVPTXISD::Suld3DV4I16Zero:
3270     Opc = NVPTX::SULD_3D_V4I16_ZERO_R;
3271     break;
3272   case NVPTXISD::Suld3DV4I32Zero:
3273     Opc = NVPTX::SULD_3D_V4I32_ZERO_R;
3274     break;
3275   }
3276 
3277   // Copy over operands
3278   SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
3279   Ops.push_back(N->getOperand(0)); // Move chain to the back.
3280 
3281   ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3282   return true;
3283 }
3284 
3285 
3286 /// SelectBFE - Look for instruction sequences that can be made more efficient
3287 /// by using the 'bfe' (bit-field extract) PTX instruction
3288 bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
3289   SDLoc DL(N);
3290   SDValue LHS = N->getOperand(0);
3291   SDValue RHS = N->getOperand(1);
3292   SDValue Len;
3293   SDValue Start;
3294   SDValue Val;
3295   bool IsSigned = false;
3296 
3297   if (N->getOpcode() == ISD::AND) {
3298     // Canonicalize the operands
3299     // We want 'and %val, %mask'
3300     if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
3301       std::swap(LHS, RHS);
3302     }
3303 
3304     ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
3305     if (!Mask) {
3306       // We need a constant mask on the RHS of the AND
3307       return false;
3308     }
3309 
3310     // Extract the mask bits
3311     uint64_t MaskVal = Mask->getZExtValue();
3312     if (!isMask_64(MaskVal)) {
3313       // We *could* handle shifted masks here, but doing so would require an
3314       // 'and' operation to fix up the low-order bits so we would trade
3315       // shr+and for bfe+and, which has the same throughput
3316       return false;
3317     }
3318 
3319     // How many bits are in our mask?
3320     int64_t NumBits = countr_one(MaskVal);
3321     Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3322 
3323     if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
3324       // We have a 'srl/and' pair, extract the effective start bit and length
3325       Val = LHS.getNode()->getOperand(0);
3326       Start = LHS.getNode()->getOperand(1);
3327       ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
3328       if (StartConst) {
3329         uint64_t StartVal = StartConst->getZExtValue();
3330         // How many "good" bits do we have left?  "good" is defined here as bits
3331         // that exist in the original value, not shifted in.
3332         int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
3333         if (NumBits > GoodBits) {
3334           // Do not handle the case where bits have been shifted in. In theory
3335           // we could handle this, but the cost is likely higher than just
3336           // emitting the srl/and pair.
3337           return false;
3338         }
3339         Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
3340       } else {
3341         // Do not handle the case where the shift amount (can be zero if no srl
3342         // was found) is not constant. We could handle this case, but it would
3343         // require run-time logic that would be more expensive than just
3344         // emitting the srl/and pair.
3345         return false;
3346       }
3347     } else {
3348       // Do not handle the case where the LHS of the and is not a shift. While
3349       // it would be trivial to handle this case, it would just transform
3350       // 'and' -> 'bfe', but 'and' has higher-throughput.
3351       return false;
3352     }
3353   } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
3354     if (LHS->getOpcode() == ISD::AND) {
3355       ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
3356       if (!ShiftCnst) {
3357         // Shift amount must be constant
3358         return false;
3359       }
3360 
3361       uint64_t ShiftAmt = ShiftCnst->getZExtValue();
3362 
3363       SDValue AndLHS = LHS->getOperand(0);
3364       SDValue AndRHS = LHS->getOperand(1);
3365 
3366       // Canonicalize the AND to have the mask on the RHS
3367       if (isa<ConstantSDNode>(AndLHS)) {
3368         std::swap(AndLHS, AndRHS);
3369       }
3370 
3371       ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
3372       if (!MaskCnst) {
3373         // Mask must be constant
3374         return false;
3375       }
3376 
3377       uint64_t MaskVal = MaskCnst->getZExtValue();
3378       uint64_t NumZeros;
3379       uint64_t NumBits;
3380       if (isMask_64(MaskVal)) {
3381         NumZeros = 0;
3382         // The number of bits in the result bitfield will be the number of
3383         // trailing ones (the AND) minus the number of bits we shift off
3384         NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
3385       } else if (isShiftedMask_64(MaskVal)) {
3386         NumZeros = llvm::countr_zero(MaskVal);
3387         unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
3388         // The number of bits in the result bitfield will be the number of
3389         // trailing zeros plus the number of set bits in the mask minus the
3390         // number of bits we shift off
3391         NumBits = NumZeros + NumOnes - ShiftAmt;
3392       } else {
3393         // This is not a mask we can handle
3394         return false;
3395       }
3396 
3397       if (ShiftAmt < NumZeros) {
3398         // Handling this case would require extra logic that would make this
3399         // transformation non-profitable
3400         return false;
3401       }
3402 
3403       Val = AndLHS;
3404       Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
3405       Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3406     } else if (LHS->getOpcode() == ISD::SHL) {
3407       // Here, we have a pattern like:
3408       //
3409       // (sra (shl val, NN), MM)
3410       // or
3411       // (srl (shl val, NN), MM)
3412       //
3413       // If MM >= NN, we can efficiently optimize this with bfe
3414       Val = LHS->getOperand(0);
3415 
3416       SDValue ShlRHS = LHS->getOperand(1);
3417       ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
3418       if (!ShlCnst) {
3419         // Shift amount must be constant
3420         return false;
3421       }
3422       uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
3423 
3424       SDValue ShrRHS = RHS;
3425       ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
3426       if (!ShrCnst) {
3427         // Shift amount must be constant
3428         return false;
3429       }
3430       uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
3431 
3432       // To avoid extra codegen and be profitable, we need Outer >= Inner
3433       if (OuterShiftAmt < InnerShiftAmt) {
3434         return false;
3435       }
3436 
3437       // If the outer shift is more than the type size, we have no bitfield to
3438       // extract (since we also check that the inner shift is <= the outer shift
3439       // then this also implies that the inner shift is < the type size)
3440       if (OuterShiftAmt >= Val.getValueSizeInBits()) {
3441         return false;
3442       }
3443 
3444       Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
3445                                         MVT::i32);
3446       Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
3447                                       DL, MVT::i32);
3448 
3449       if (N->getOpcode() == ISD::SRA) {
3450         // If we have a arithmetic right shift, we need to use the signed bfe
3451         // variant
3452         IsSigned = true;
3453       }
3454     } else {
3455       // No can do...
3456       return false;
3457     }
3458   } else {
3459     // No can do...
3460     return false;
3461   }
3462 
3463 
3464   unsigned Opc;
3465   // For the BFE operations we form here from "and" and "srl", always use the
3466   // unsigned variants.
3467   if (Val.getValueType() == MVT::i32) {
3468     if (IsSigned) {
3469       Opc = NVPTX::BFE_S32rii;
3470     } else {
3471       Opc = NVPTX::BFE_U32rii;
3472     }
3473   } else if (Val.getValueType() == MVT::i64) {
3474     if (IsSigned) {
3475       Opc = NVPTX::BFE_S64rii;
3476     } else {
3477       Opc = NVPTX::BFE_U64rii;
3478     }
3479   } else {
3480     // We cannot handle this type
3481     return false;
3482   }
3483 
3484   SDValue Ops[] = {
3485     Val, Start, Len
3486   };
3487 
3488   ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
3489   return true;
3490 }
3491 
3492 // SelectDirectAddr - Match a direct address for DAG.
3493 // A direct address could be a globaladdress or externalsymbol.
3494 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
3495   // Return true if TGA or ES.
3496   if (N.getOpcode() == ISD::TargetGlobalAddress ||
3497       N.getOpcode() == ISD::TargetExternalSymbol) {
3498     Address = N;
3499     return true;
3500   }
3501   if (N.getOpcode() == NVPTXISD::Wrapper) {
3502     Address = N.getOperand(0);
3503     return true;
3504   }
3505   // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3506   if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
3507     if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
3508         CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
3509         CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
3510       return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
3511   }
3512   return false;
3513 }
3514 
3515 // symbol+offset
3516 bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3517     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3518   if (Addr.getOpcode() == ISD::ADD) {
3519     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3520       SDValue base = Addr.getOperand(0);
3521       if (SelectDirectAddr(base, Base)) {
3522         Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3523                                            mvt);
3524         return true;
3525       }
3526     }
3527   }
3528   return false;
3529 }
3530 
3531 // symbol+offset
bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  // Delegate to the width-parameterized matcher with a 32-bit offset type.
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
}
3536 
3537 // symbol+offset
bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  // Delegate to the width-parameterized matcher with a 64-bit offset type.
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
}
3542 
3543 // register+offset
3544 bool NVPTXDAGToDAGISel::SelectADDRri_imp(
3545     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3546   if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
3547     Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3548     Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
3549     return true;
3550   }
3551   if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
3552       Addr.getOpcode() == ISD::TargetGlobalAddress)
3553     return false; // direct calls.
3554 
3555   if (Addr.getOpcode() == ISD::ADD) {
3556     if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
3557       return false;
3558     }
3559     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3560       if (FrameIndexSDNode *FIN =
3561               dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
3562         // Constant offset from frame ref.
3563         Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3564       else
3565         Base = Addr.getOperand(0);
3566       Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3567                                          mvt);
3568       return true;
3569     }
3570   }
3571   return false;
3572 }
3573 
3574 // register+offset
bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  // Delegate to the width-parameterized matcher with a 32-bit offset type.
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
}
3579 
3580 // register+offset
bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  // Delegate to the width-parameterized matcher with a 64-bit offset type.
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
}
3585 
3586 bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
3587                                                  unsigned int spN) const {
3588   const Value *Src = nullptr;
3589   if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
3590     if (spN == 0 && mN->getMemOperand()->getPseudoValue())
3591       return true;
3592     Src = mN->getMemOperand()->getValue();
3593   }
3594   if (!Src)
3595     return false;
3596   if (auto *PT = dyn_cast<PointerType>(Src->getType()))
3597     return (PT->getAddressSpace() == spN);
3598   return false;
3599 }
3600 
3601 /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3602 /// inline asm expressions.
3603 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
3604     const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
3605     std::vector<SDValue> &OutOps) {
3606   SDValue Op0, Op1;
3607   switch (ConstraintID) {
3608   default:
3609     return true;
3610   case InlineAsm::ConstraintCode::m: // memory
3611     if (SelectDirectAddr(Op, Op0)) {
3612       OutOps.push_back(Op0);
3613       OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
3614       return false;
3615     }
3616     if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
3617       OutOps.push_back(Op0);
3618       OutOps.push_back(Op1);
3619       return false;
3620     }
3621     break;
3622   }
3623   return true;
3624 }
3625 
3626 /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
3627 /// conversion from \p SrcTy to \p DestTy.
3628 unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
3629                                              LoadSDNode *LdNode) {
3630   bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
3631   switch (SrcTy.SimpleTy) {
3632   default:
3633     llvm_unreachable("Unhandled source type");
3634   case MVT::i8:
3635     switch (DestTy.SimpleTy) {
3636     default:
3637       llvm_unreachable("Unhandled dest type");
3638     case MVT::i16:
3639       return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
3640     case MVT::i32:
3641       return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
3642     case MVT::i64:
3643       return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
3644     }
3645   case MVT::i16:
3646     switch (DestTy.SimpleTy) {
3647     default:
3648       llvm_unreachable("Unhandled dest type");
3649     case MVT::i8:
3650       return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
3651     case MVT::i32:
3652       return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
3653     case MVT::i64:
3654       return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
3655     }
3656   case MVT::i32:
3657     switch (DestTy.SimpleTy) {
3658     default:
3659       llvm_unreachable("Unhandled dest type");
3660     case MVT::i8:
3661       return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
3662     case MVT::i16:
3663       return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
3664     case MVT::i64:
3665       return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
3666     }
3667   case MVT::i64:
3668     switch (DestTy.SimpleTy) {
3669     default:
3670       llvm_unreachable("Unhandled dest type");
3671     case MVT::i8:
3672       return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
3673     case MVT::i16:
3674       return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
3675     case MVT::i32:
3676       return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
3677     }
3678   case MVT::f16:
3679     switch (DestTy.SimpleTy) {
3680     default:
3681       llvm_unreachable("Unhandled dest type");
3682     case MVT::f32:
3683       return NVPTX::CVT_f32_f16;
3684     case MVT::f64:
3685       return NVPTX::CVT_f64_f16;
3686     }
3687   }
3688 }
3689