xref: /freebsd/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (revision 1db9f3b21e39176dd5b67cf8ac378633b172463e)
1 //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the interfaces that NVPTX uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "NVPTXISelLowering.h"
15 #include "MCTargetDesc/NVPTXBaseInfo.h"
16 #include "NVPTX.h"
17 #include "NVPTXSubtarget.h"
18 #include "NVPTXTargetMachine.h"
19 #include "NVPTXTargetObjectFile.h"
20 #include "NVPTXUtilities.h"
21 #include "llvm/ADT/APInt.h"
22 #include "llvm/ADT/STLExtras.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/ADT/StringRef.h"
25 #include "llvm/CodeGen/Analysis.h"
26 #include "llvm/CodeGen/ISDOpcodes.h"
27 #include "llvm/CodeGen/MachineFunction.h"
28 #include "llvm/CodeGen/MachineMemOperand.h"
29 #include "llvm/CodeGen/MachineValueType.h"
30 #include "llvm/CodeGen/SelectionDAG.h"
31 #include "llvm/CodeGen/SelectionDAGNodes.h"
32 #include "llvm/CodeGen/TargetCallingConv.h"
33 #include "llvm/CodeGen/TargetLowering.h"
34 #include "llvm/CodeGen/ValueTypes.h"
35 #include "llvm/IR/Argument.h"
36 #include "llvm/IR/Attributes.h"
37 #include "llvm/IR/Constants.h"
38 #include "llvm/IR/DataLayout.h"
39 #include "llvm/IR/DerivedTypes.h"
40 #include "llvm/IR/DiagnosticInfo.h"
41 #include "llvm/IR/FPEnv.h"
42 #include "llvm/IR/Function.h"
43 #include "llvm/IR/GlobalValue.h"
44 #include "llvm/IR/Instruction.h"
45 #include "llvm/IR/Instructions.h"
46 #include "llvm/IR/IntrinsicsNVPTX.h"
47 #include "llvm/IR/Module.h"
48 #include "llvm/IR/Type.h"
49 #include "llvm/IR/Value.h"
50 #include "llvm/Support/Casting.h"
51 #include "llvm/Support/CodeGen.h"
52 #include "llvm/Support/CommandLine.h"
53 #include "llvm/Support/ErrorHandling.h"
54 #include "llvm/Support/raw_ostream.h"
55 #include "llvm/Target/TargetMachine.h"
56 #include "llvm/Target/TargetOptions.h"
57 #include <algorithm>
#include <atomic>
58 #include <cassert>
59 #include <cmath>
60 #include <cstdint>
61 #include <iterator>
62 #include <sstream>
63 #include <string>
64 #include <utility>
65 #include <vector>
66 
67 #define DEBUG_TYPE "nvptx-lower"
68 
69 using namespace llvm;
70 
71 static std::atomic<unsigned> GlobalUniqueCallSite;
72 
73 static cl::opt<bool> sched4reg(
74     "nvptx-sched4reg",
75     cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));
76 
77 static cl::opt<unsigned> FMAContractLevelOpt(
78     "nvptx-fma-level", cl::Hidden,
79     cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
80              " 1: do it, 2: do it aggressively)"),
81     cl::init(2));
82 
83 static cl::opt<int> UsePrecDivF32(
84     "nvptx-prec-divf32", cl::Hidden,
85     cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
86              " IEEE Compliant F32 div.rnd if available."),
87     cl::init(2));
88 
89 static cl::opt<bool> UsePrecSqrtF32(
90     "nvptx-prec-sqrtf32", cl::Hidden,
91     cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
92     cl::init(true));
93 
94 static cl::opt<bool> ForceMinByValParamAlign(
95     "nvptx-force-min-byval-param-align", cl::Hidden,
96     cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
97              " params of device functions."),
98     cl::init(false));
99 
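// Returns the f32 division precision to use: 0 = div.approx.f32,
// 1 = div.full.f32, 2 = IEEE-compliant div.rnd (see nvptx-prec-divf32 above).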
100 int NVPTXTargetLowering::getDivF32Level() const {
101   if (UsePrecDivF32.getNumOccurrences() > 0) {
102     // If nvptx-prec-divf32=N is used on the command line, always honor it
103     return UsePrecDivF32;
104   } else {
105     // Otherwise, use div.approx if fast math is enabled
106     if (getTargetMachine().Options.UnsafeFPMath)
107       return 0;
108     else
109       return 2;
110   }
111 }
112 
113 bool NVPTXTargetLowering::usePrecSqrtF32() const {
114   if (UsePrecSqrtF32.getNumOccurrences() > 0) {
115     // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
116     return UsePrecSqrtF32;
117   } else {
118     // Otherwise, use sqrt.approx if fast math is enabled
119     return !getTargetMachine().Options.UnsafeFPMath;
120   }
121 }
122 
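// Whether to use the ".ftz" (flush-to-zero) variants of f32 instructions:
// true when the function's f32 denormal mode flushes outputs (PreserveSign).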
123 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
124   return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
125          DenormalMode::PreserveSign;
126 }
127 
128 static bool IsPTXVectorType(MVT VT) {
129   switch (VT.SimpleTy) {
130   default:
131     return false;
132   case MVT::v2i1:
133   case MVT::v4i1:
134   case MVT::v2i8:
135   case MVT::v4i8:
136   case MVT::v2i16:
137   case MVT::v4i16:
138   case MVT::v8i16: // <4 x i16x2>
139   case MVT::v2i32:
140   case MVT::v4i32:
141   case MVT::v2i64:
142   case MVT::v2f16:
143   case MVT::v4f16:
144   case MVT::v8f16: // <4 x f16x2>
145   case MVT::v2bf16:
146   case MVT::v4bf16:
147   case MVT::v8bf16: // <4 x bf16x2>
148   case MVT::v2f32:
149   case MVT::v4f32:
150   case MVT::v2f64:
151     return true;
152   }
153 }
154 
155 static bool Is16bitsType(MVT VT) {
156   return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 ||
157           VT.SimpleTy == MVT::i16);
158 }
159 
160 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
161 /// EVTs that compose it.  Unlike ComputeValueVTs, this will break apart vectors
162 /// into their primitive components.
163 /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
164 /// same number of types as the Ins/Outs arrays in LowerFormalArguments,
165 /// LowerCall, and LowerReturn.
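/// For example, `struct { i128, <4 x half> }` decomposes to
/// ValueVTs = {i64, i64, v2f16, v2f16} with Offsets = {0, 8, 16, 20}
/// (an illustrative sketch assuming the default NVPTX struct layout).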
166 static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
167                                Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
168                                SmallVectorImpl<uint64_t> *Offsets = nullptr,
169                                uint64_t StartingOffset = 0) {
170   SmallVector<EVT, 16> TempVTs;
171   SmallVector<uint64_t, 16> TempOffsets;
172 
173   // Special case for i128 - decompose to (i64, i64)
174   if (Ty->isIntegerTy(128)) {
175     ValueVTs.push_back(EVT(MVT::i64));
176     ValueVTs.push_back(EVT(MVT::i64));
177 
178     if (Offsets) {
179       Offsets->push_back(StartingOffset + 0);
180       Offsets->push_back(StartingOffset + 8);
181     }
182 
183     return;
184   }
185 
186   // Given a struct type, recursively traverse its elements using this custom ComputePTXValueVTs.
187   if (StructType *STy = dyn_cast<StructType>(Ty)) {
188     auto const *SL = DL.getStructLayout(STy);
189     auto ElementNum = 0;
190     for (auto *EI : STy->elements()) {
191       ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
192                          StartingOffset + SL->getElementOffset(ElementNum));
193       ++ElementNum;
194     }
195     return;
196   }
197 
198   ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
199   for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
200     EVT VT = TempVTs[i];
201     uint64_t Off = TempOffsets[i];
202     // Split vectors into individual elements, except for packed types
203     // (v2f16/v2bf16/v2i16/v4i8), which we pass as single scalars.
204     if (VT.isVector()) {
205       unsigned NumElts = VT.getVectorNumElements();
206       EVT EltVT = VT.getVectorElementType();
207       // Vectors with an even number of 16-bit elements (f16/bf16/i16) will
208       // be passed to us as an array of v2f16/v2bf16/v2i16 elements. We must
209       // match this so we stay in sync with Ins/Outs.
210       if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) {
211         switch (EltVT.getSimpleVT().SimpleTy) {
212         case MVT::f16:
213           EltVT = MVT::v2f16;
214           break;
215         case MVT::bf16:
216           EltVT = MVT::v2bf16;
217           break;
218         case MVT::i16:
219           EltVT = MVT::v2i16;
220           break;
221         default:
222           llvm_unreachable("Unexpected type");
223         }
224         NumElts /= 2;
225       } else if (EltVT.getSimpleVT() == MVT::i8 &&
226                  (NumElts % 4 == 0 || NumElts == 3)) {
227         // v*i8 are formally lowered as v4i8
228         EltVT = MVT::v4i8;
229         NumElts = (NumElts + 3) / 4;
230       }
231       for (unsigned j = 0; j != NumElts; ++j) {
232         ValueVTs.push_back(EltVT);
233         if (Offsets)
234           Offsets->push_back(Off + j * EltVT.getStoreSize());
235       }
236     } else {
237       ValueVTs.push_back(VT);
238       if (Offsets)
239         Offsets->push_back(Off);
240     }
241   }
242 }
243 
244 /// PromoteScalarIntegerPTX
245 /// Used to make sure the arguments/returns are suitable for passing
246 /// and promote them to a larger size if they're not.
247 ///
248 /// The promoted type is placed in \p PromotedVT if the function returns true.
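/// For example, an i24 argument would be promoted to i32, while an i32
/// argument is already suitable and the function returns false.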
249 static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
250   if (VT.isScalarInteger()) {
251     switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
252     default:
253       llvm_unreachable(
254           "Promotion is not suitable for scalars of size larger than 64-bits");
255     case 1:
256       *PromotedVT = MVT::i1;
257       break;
258     case 2:
259     case 4:
260     case 8:
261       *PromotedVT = MVT::i8;
262       break;
263     case 16:
264       *PromotedVT = MVT::i16;
265       break;
266     case 32:
267       *PromotedVT = MVT::i32;
268       break;
269     case 64:
270       *PromotedVT = MVT::i64;
271       break;
272     }
273     return EVT(*PromotedVT) != VT;
274   }
275   return false;
276 }
277 
278 // Check whether we can merge loads/stores of some of the pieces of a
279 // flattened function parameter or return value into a single vector
280 // load/store.
281 //
282 // The flattened parameter is represented as a list of EVTs and
283 // offsets, and the whole structure is aligned to ParamAlignment. This
284 // function determines whether we can load/store pieces of the
285 // parameter starting at index Idx using a single vectorized op of
286 // size AccessSize. If so, it returns the number of param pieces
287 // covered by the vector op. Otherwise, it returns 1.
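//
// For example (an illustrative sketch): four f32 pieces at offsets
// {0, 4, 8, 12} with 16-byte alignment can be merged into one 16-byte
// access, so a query at Idx = 0 with AccessSize = 16 returns 4.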
288 static unsigned CanMergeParamLoadStoresStartingAt(
289     unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
290     const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
291 
292   // Can't vectorize if param alignment is not sufficient.
293   if (ParamAlignment < AccessSize)
294     return 1;
295   // Can't vectorize if offset is not aligned.
296   if (Offsets[Idx] & (AccessSize - 1))
297     return 1;
298 
299   EVT EltVT = ValueVTs[Idx];
300   unsigned EltSize = EltVT.getStoreSize();
301 
302   // Element is too large to vectorize.
303   if (EltSize >= AccessSize)
304     return 1;
305 
306   unsigned NumElts = AccessSize / EltSize;
307   // Can't vectorize if AccessSize is not a multiple of EltSize.
308   if (AccessSize != EltSize * NumElts)
309     return 1;
310 
311   // We don't have enough elements to vectorize.
312   if (Idx + NumElts > ValueVTs.size())
313     return 1;
314 
315   // PTX ISA can only deal with 2- and 4-element vector ops.
316   if (NumElts != 4 && NumElts != 2)
317     return 1;
318 
319   for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
320     // Types do not match.
321     if (ValueVTs[j] != EltVT)
322       return 1;
323 
324     // Elements are not contiguous.
325     if (Offsets[j] - Offsets[j - 1] != EltSize)
326       return 1;
327   }
328   // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
329   return NumElts;
330 }
331 
332 // Flags for tracking per-element vectorization state of loads/stores
333 // of a flattened function parameter or return value.
334 enum ParamVectorizationFlags {
335   PVF_INNER = 0x0, // Middle elements of a vector.
336   PVF_FIRST = 0x1, // First element of the vector.
337   PVF_LAST = 0x2,  // Last element of the vector.
338   // Scalar is effectively a 1-element vector.
339   PVF_SCALAR = PVF_FIRST | PVF_LAST
340 };
341 
342 // Computes whether and how we can vectorize the loads/stores of a
343 // flattened function parameter or return value.
344 //
345 // The flattened parameter is represented as the list of ValueVTs and
346 // Offsets, and is aligned to ParamAlignment bytes. We return a vector
347 // of the same size as ValueVTs indicating how each piece should be
348 // loaded/stored (i.e. as a scalar, or as part of a vector
349 // load/store).
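//
// For example (an illustrative sketch): four f32 pieces at offsets
// {0, 4, 8, 12} with 16-byte alignment come back as
// {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST}, i.e. a single v4 access.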
350 static SmallVector<ParamVectorizationFlags, 16>
351 VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
352                      const SmallVectorImpl<uint64_t> &Offsets,
353                      Align ParamAlignment, bool IsVAArg = false) {
354   // Set vector size to match ValueVTs and mark all elements as
355   // scalars by default.
356   SmallVector<ParamVectorizationFlags, 16> VectorInfo;
357   VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
358 
359   if (IsVAArg)
360     return VectorInfo;
361 
362   // Check what we can vectorize using 128/64/32/16-bit accesses.
363   for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
364     // Skip elements we've already processed.
365     assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
366     for (unsigned AccessSize : {16, 8, 4, 2}) {
367       unsigned NumElts = CanMergeParamLoadStoresStartingAt(
368           I, AccessSize, ValueVTs, Offsets, ParamAlignment);
369       // Mark vectorized elements.
370       switch (NumElts) {
371       default:
372         llvm_unreachable("Unexpected return value");
373       case 1:
374         // Can't vectorize using this size, try next smaller size.
375         continue;
376       case 2:
377         assert(I + 1 < E && "Not enough elements.");
378         VectorInfo[I] = PVF_FIRST;
379         VectorInfo[I + 1] = PVF_LAST;
380         I += 1;
381         break;
382       case 4:
383         assert(I + 3 < E && "Not enough elements.");
384         VectorInfo[I] = PVF_FIRST;
385         VectorInfo[I + 1] = PVF_INNER;
386         VectorInfo[I + 2] = PVF_INNER;
387         VectorInfo[I + 3] = PVF_LAST;
388         I += 3;
389         break;
390       }
391       // Break out of the inner loop because we've already succeeded
392       // using the largest possible AccessSize.
393       break;
394     }
395   }
396   return VectorInfo;
397 }
398 
399 // NVPTXTargetLowering Constructor.
400 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
401                                          const NVPTXSubtarget &STI)
402     : TargetLowering(TM), nvTM(&TM), STI(STI) {
403   // Always lower memset, memcpy, and memmove intrinsics to load/store
404   // instructions, rather than generating calls to memset, memcpy, or
405   // memmove.
406   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF;
407   MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned) 0xFFFFFFFF;
408   MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned) 0xFFFFFFFF;
409 
410   setBooleanContents(ZeroOrNegativeOneBooleanContent);
411   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
412 
413   // Jump is Expensive. Don't create extra control flow for 'and', 'or'
414   // condition branches.
415   setJumpIsExpensive(true);
416 
417   // Wide divides are _very_ slow. Try to reduce the width of the divide if
418   // possible.
419   addBypassSlowDiv(64, 32);
420 
421   // By default, use Source scheduling.
422   if (sched4reg)
423     setSchedulingPreference(Sched::RegPressure);
424   else
425     setSchedulingPreference(Sched::Source);
426 
427   auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
428                                     LegalizeAction NoF16Action) {
429     setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
430   };
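  // For example, setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote)
  // below marks f16 SETCC as Legal when FP16 math is allowed and as Promote
  // otherwise.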
431 
432   auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
433                                     LegalizeAction NoBF16Action) {
434     bool IsOpSupported = STI.hasBF16Math();
435     // A few instructions are only available on sm_90 and later.
436     switch (Op) {
437       case ISD::FADD:
438       case ISD::FMUL:
439       case ISD::FSUB:
440       case ISD::SELECT:
441       case ISD::SELECT_CC:
442       case ISD::SETCC:
443       case ISD::FEXP2:
444       case ISD::FCEIL:
445       case ISD::FFLOOR:
446       case ISD::FNEARBYINT:
447       case ISD::FRINT:
448       case ISD::FTRUNC:
449         IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;
450         break;
451     }
452     setOperationAction(
453         Op, VT, IsOpSupported ? Action : NoBF16Action);
454   };
455 
456   auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
457                                      LegalizeAction NoI16x2Action) {
458     bool IsOpSupported = false;
459     // These instructions are only available on sm_90 and later.
460     switch (Op) {
461     case ISD::ADD:
462     case ISD::SMAX:
463     case ISD::SMIN:
464     case ISD::UMIN:
465     case ISD::UMAX:
466     case ISD::SUB:
467       IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
468       break;
469     }
470     setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
471   };
472 
473   addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
474   addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
475   addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
476   addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
477   addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
478   addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
479   addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
480   addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
481   addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
482   addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
483   addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
484   addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);
485 
486   // Conversion to/from FP16/FP16x2 is always legal.
487   setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
488   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
489   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
490   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);
491 
492   setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
493   setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
494 
495   // Conversion to/from BF16/BF16x2 is always legal.
496   setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Custom);
497   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2bf16, Custom);
498   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2bf16, Expand);
499   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2bf16, Expand);
500 
501   setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
502   setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
503   if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
504     AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
505 
506   // Conversion to/from i16/i16x2 is always legal.
507   setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
508   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
509   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand);
510   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand);
511 
512   setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);
513   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
514   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
515   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
516   // Only logical ops can be done on v4i8 directly; others must be done
517   // elementwise.
518   setOperationAction(
519       {ISD::ABS,         ISD::ADD,        ISD::ADDC,        ISD::ADDE,
520        ISD::BITREVERSE,  ISD::CTLZ,       ISD::CTPOP,       ISD::CTTZ,
521        ISD::FP_TO_SINT,  ISD::FP_TO_UINT, ISD::FSHL,        ISD::FSHR,
522        ISD::MUL,         ISD::MULHS,      ISD::MULHU,       ISD::PARITY,
523        ISD::ROTL,        ISD::ROTR,       ISD::SADDO,       ISD::SADDO_CARRY,
524        ISD::SADDSAT,     ISD::SDIV,       ISD::SDIVREM,     ISD::SELECT_CC,
525        ISD::SETCC,       ISD::SHL,        ISD::SINT_TO_FP,  ISD::SMAX,
526        ISD::SMIN,        ISD::SMULO,      ISD::SMUL_LOHI,   ISD::SRA,
527        ISD::SREM,        ISD::SRL,        ISD::SSHLSAT,     ISD::SSUBO,
528        ISD::SSUBO_CARRY, ISD::SSUBSAT,    ISD::SUB,         ISD::SUBC,
529        ISD::SUBE,        ISD::UADDO,      ISD::UADDO_CARRY, ISD::UADDSAT,
530        ISD::UDIV,        ISD::UDIVREM,    ISD::UINT_TO_FP,  ISD::UMAX,
531        ISD::UMIN,        ISD::UMULO,      ISD::UMUL_LOHI,   ISD::UREM,
532        ISD::USHLSAT,     ISD::USUBO,      ISD::USUBO_CARRY, ISD::VSELECT,
533        ISD::USUBSAT},
534       MVT::v4i8, Expand);
535 
536   // Operations not directly supported by NVPTX.
537   for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
538                  MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
539                  MVT::i32, MVT::i64}) {
540     setOperationAction(ISD::SELECT_CC, VT, Expand);
541     setOperationAction(ISD::BR_CC, VT, Expand);
542   }
543 
544   // Some SIGN_EXTEND_INREG can be done using the cvt instruction.
545   // For others we will expand to a SHL/SRA pair.
546   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
547   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
548   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
549   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
550   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
551   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
552 
553   setOperationAction(ISD::SHL_PARTS, MVT::i32  , Custom);
554   setOperationAction(ISD::SRA_PARTS, MVT::i32  , Custom);
555   setOperationAction(ISD::SRL_PARTS, MVT::i32  , Custom);
556   setOperationAction(ISD::SHL_PARTS, MVT::i64  , Custom);
557   setOperationAction(ISD::SRA_PARTS, MVT::i64  , Custom);
558   setOperationAction(ISD::SRL_PARTS, MVT::i64  , Custom);
559 
560   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
561   setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
562 
563   // TODO: we may consider expanding ROTL/ROTR on older GPUs.  Currently on GPUs
564   // that don't have h/w rotation we lower them to multi-instruction assembly.
565   // See ROT*_sw in NVPTXInstrInfo.td.
566   setOperationAction(ISD::ROTL, MVT::i64, Legal);
567   setOperationAction(ISD::ROTR, MVT::i64, Legal);
568   setOperationAction(ISD::ROTL, MVT::i32, Legal);
569   setOperationAction(ISD::ROTR, MVT::i32, Legal);
570 
571   setOperationAction(ISD::ROTL, MVT::i16, Expand);
572   setOperationAction(ISD::ROTL, MVT::v2i16, Expand);
573   setOperationAction(ISD::ROTR, MVT::i16, Expand);
574   setOperationAction(ISD::ROTR, MVT::v2i16, Expand);
575   setOperationAction(ISD::ROTL, MVT::i8, Expand);
576   setOperationAction(ISD::ROTR, MVT::i8, Expand);
577   setOperationAction(ISD::BSWAP, MVT::i16, Expand);
578   setOperationAction(ISD::BSWAP, MVT::v2i16, Expand);
579   setOperationAction(ISD::BSWAP, MVT::i32, Expand);
580   setOperationAction(ISD::BSWAP, MVT::i64, Expand);
581 
582   // Indirect branch is not supported.
583   // This also disables Jump Table creation.
584   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
585   setOperationAction(ISD::BRIND, MVT::Other, Expand);
586 
587   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
588   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
589 
590   // We want to legalize constant-related memmove and memcpy
591   // intrinsics.
592   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
593 
594   // Turn FP extload into load/fpextend
595   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
596   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
597   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
598   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
599   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
600   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
601   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
602   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
603   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
604   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
605   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
606   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
607   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
608   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
609   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
610   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
611   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
612   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
613   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
614   // Turn FP truncstore into trunc + store.
615   // FIXME: vector types should also be expanded
616   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
617   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
618   setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
619   setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
620   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
621 
622   // PTX does not support loads/stores of predicate registers.
623   setOperationAction(ISD::LOAD, MVT::i1, Custom);
624   setOperationAction(ISD::STORE, MVT::i1, Custom);
625 
626   for (MVT VT : MVT::integer_valuetypes()) {
627     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
628     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
629     setTruncStoreAction(VT, MVT::i1, Expand);
630   }
631 
632   // Expand extloads of integer vectors.
633   setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
634                    MVT::v2i8, Expand);
635   setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
636 
637   // This is legal in NVPTX
638   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
639   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
640   setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
641   setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
642 
643   // Lowering of DYNAMIC_STACKALLOC is unsupported.
644   // Custom lower to produce an error.
645   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
646   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
647 
648   // TRAP can be lowered to PTX trap
649   setOperationAction(ISD::TRAP, MVT::Other, Legal);
650 
651   // Register custom handling for vector loads/stores
652   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
653     if (IsPTXVectorType(VT)) {
654       setOperationAction(ISD::LOAD, VT, Custom);
655       setOperationAction(ISD::STORE, VT, Custom);
656       setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
657     }
658   }
659 
660   // Support varargs.
661   setOperationAction(ISD::VASTART, MVT::Other, Custom);
662   setOperationAction(ISD::VAARG, MVT::Other, Custom);
663   setOperationAction(ISD::VACOPY, MVT::Other, Expand);
664   setOperationAction(ISD::VAEND, MVT::Other, Expand);
665 
666   // Custom handling for i8 intrinsics
667   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
668 
669   for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
670     setOperationAction(ISD::ABS,  Ty, Legal);
671     setOperationAction(ISD::SMIN, Ty, Legal);
672     setOperationAction(ISD::SMAX, Ty, Legal);
673     setOperationAction(ISD::UMIN, Ty, Legal);
674     setOperationAction(ISD::UMAX, Ty, Legal);
675 
676     setOperationAction(ISD::CTPOP, Ty, Legal);
677     setOperationAction(ISD::CTLZ, Ty, Legal);
678   }
679 
680   setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
681   setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
682   setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
683   setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
684   setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
685   setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
686   setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
687 
688   setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
689   setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
690   setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
691   setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
692   setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
693   setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
694 
695   // Other arithmetic and logic ops are unsupported.
696   setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,
697                       ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
698                       ISD::SINT_TO_FP, ISD::UINT_TO_FP},
699                      MVT::v2i16, Expand);
700 
701   setOperationAction(ISD::ADDC, MVT::i32, Legal);
702   setOperationAction(ISD::ADDE, MVT::i32, Legal);
703   setOperationAction(ISD::SUBC, MVT::i32, Legal);
704   setOperationAction(ISD::SUBE, MVT::i32, Legal);
705   if (STI.getPTXVersion() >= 43) {
706     setOperationAction(ISD::ADDC, MVT::i64, Legal);
707     setOperationAction(ISD::ADDE, MVT::i64, Legal);
708     setOperationAction(ISD::SUBC, MVT::i64, Legal);
709     setOperationAction(ISD::SUBE, MVT::i64, Legal);
710   }
711 
712   setOperationAction(ISD::CTTZ, MVT::i16, Expand);
713   setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
714   setOperationAction(ISD::CTTZ, MVT::i32, Expand);
715   setOperationAction(ISD::CTTZ, MVT::i64, Expand);
716 
717   // PTX does not directly support SELP of i1, so promote to i32 first
718   setOperationAction(ISD::SELECT, MVT::i1, Custom);
719 
720   // PTX cannot multiply two i64s in a single instruction.
721   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
722   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
723 
724   // We have some custom DAG combine patterns for these nodes
725   setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
726                        ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
727                        ISD::VSELECT});
728 
729   // setcc for f16x2 and bf16x2 needs special handling to prevent
730   // the legalizer's attempt to scalarize it due to v2i1 not being legal.
731   if (STI.allowFP16Math() || STI.hasBF16Math())
732     setTargetDAGCombine(ISD::SETCC);
733 
734   // Promote fp16 arithmetic if fp16 hardware isn't available or the
735   // user passed --nvptx-no-fp16-math. The flag is useful because,
736   // although sm_53+ GPUs have some sort of FP16 support in
737   // hardware, only sm_53 and sm_60 have a full implementation. Others
738   // only have a token amount of hardware and are likely to run faster
739   // by using fp32 units instead.
740   for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
741     setFP16OperationAction(Op, MVT::f16, Legal, Promote);
742     setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
743     setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
744     // bf16 must be promoted to f32.
745     setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
746     if (getOperationAction(Op, MVT::bf16) == Promote)
747       AddPromotedToType(Op, MVT::bf16, MVT::f32);
748   }
749 
750   // f16/f16x2 neg was introduced in PTX 60, SM_53.
751   const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
752                                         STI.getPTXVersion() >= 60 &&
753                                         STI.allowFP16Math();
754   for (const auto &VT : {MVT::f16, MVT::v2f16})
755     setOperationAction(ISD::FNEG, VT,
756                        IsFP16FP16x2NegAvailable ? Legal : Expand);
757 
758   setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
759   setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
760   // (would be) Library functions.
761 
762   // These map to conversion instructions for scalar FP types.
763   for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
764                          ISD::FROUNDEVEN, ISD::FTRUNC}) {
765     setOperationAction(Op, MVT::f16, Legal);
766     setOperationAction(Op, MVT::f32, Legal);
767     setOperationAction(Op, MVT::f64, Legal);
768     setOperationAction(Op, MVT::v2f16, Expand);
769     setOperationAction(Op, MVT::v2bf16, Expand);
770     setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
771     if (getOperationAction(Op, MVT::bf16) == Promote)
772       AddPromotedToType(Op, MVT::bf16, MVT::f32);
773   }
774 
775   // sm_80 only has conversions between f32 and bf16. Custom lower all other
776   // bf16 conversions.
777   if (STI.hasBF16Math() &&
778       (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
779     for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
780       setOperationAction(
781           {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
782           VT, Custom);
783     }
784   }
785 
786   setOperationAction(ISD::FROUND, MVT::f16, Promote);
787   setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
788   setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);
789   setOperationAction(ISD::FROUND, MVT::f32, Custom);
790   setOperationAction(ISD::FROUND, MVT::f64, Custom);
791   setOperationAction(ISD::FROUND, MVT::bf16, Promote);
792   AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
793 
794   // 'Expand' implements FCOPYSIGN without calling an external library.
795   setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
796   setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
797   setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);
798   setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand);
799   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
800   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
801 
802   // These map to corresponding instructions for f32/f64. f16 must be
803   // promoted to f32. v2f16 is expanded to f16, which is then promoted
804   // to f32.
805   for (const auto &Op :
806        {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {
807     setOperationAction(Op, MVT::f16, Promote);
808     setOperationAction(Op, MVT::f32, Legal);
809     setOperationAction(Op, MVT::f64, Legal);
810     setOperationAction(Op, MVT::v2f16, Expand);
811     setOperationAction(Op, MVT::v2bf16, Expand);
812     setOperationAction(Op, MVT::bf16, Promote);
813     AddPromotedToType(Op, MVT::bf16, MVT::f32);
814   }
815   for (const auto &Op : {ISD::FABS}) {
816     setOperationAction(Op, MVT::f16, Promote);
817     setOperationAction(Op, MVT::f32, Legal);
818     setOperationAction(Op, MVT::f64, Legal);
819     setOperationAction(Op, MVT::v2f16, Expand);
820     setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
821     setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
822     if (getOperationAction(Op, MVT::bf16) == Promote)
823       AddPromotedToType(Op, MVT::bf16, MVT::f32);
824   }
825 
826   // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
827   auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
828     bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
829     return IsAtLeastSm80 ? Legal : NotSm80Action;
830   };
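  // For example, with GetMinMaxAction(Promote), FMINNUM/FMAXNUM on f16 are
  // Legal on sm_80+ with PTX 7.0+ (when FP16 math is allowed) and Promote
  // (to f32) otherwise.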
831   for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
832     setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
833     setOperationAction(Op, MVT::f32, Legal);
834     setOperationAction(Op, MVT::f64, Legal);
835     setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
836     setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
837     setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
838     if (getOperationAction(Op, MVT::bf16) == Promote)
839       AddPromotedToType(Op, MVT::bf16, MVT::f32);
840   }
841   for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
842     setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
843     setFP16OperationAction(Op, MVT::bf16, Legal, Expand);
844     setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
845     setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
846     setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
847   }
848 
849   // No FEXP2, FLOG2.  The PTX ex2 and log2 functions are always approximate.
850   // No FPOW or FREM in PTX.
851 
852   // Now deduce the information based on the above-mentioned
853   // actions.
854   computeRegisterProperties(STI.getRegisterInfo());
855 
856   setMinCmpXchgSizeInBits(32);
857   setMaxAtomicSizeInBitsSupported(64);
858 }
859 
860 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
861   switch ((NVPTXISD::NodeType)Opcode) {
862   case NVPTXISD::FIRST_NUMBER:
863     break;
864   case NVPTXISD::CALL:
865     return "NVPTXISD::CALL";
866   case NVPTXISD::RET_GLUE:
867     return "NVPTXISD::RET_GLUE";
868   case NVPTXISD::LOAD_PARAM:
869     return "NVPTXISD::LOAD_PARAM";
870   case NVPTXISD::Wrapper:
871     return "NVPTXISD::Wrapper";
872   case NVPTXISD::DeclareParam:
873     return "NVPTXISD::DeclareParam";
874   case NVPTXISD::DeclareScalarParam:
875     return "NVPTXISD::DeclareScalarParam";
876   case NVPTXISD::DeclareRet:
877     return "NVPTXISD::DeclareRet";
878   case NVPTXISD::DeclareScalarRet:
879     return "NVPTXISD::DeclareScalarRet";
880   case NVPTXISD::DeclareRetParam:
881     return "NVPTXISD::DeclareRetParam";
882   case NVPTXISD::PrintCall:
883     return "NVPTXISD::PrintCall";
884   case NVPTXISD::PrintConvergentCall:
885     return "NVPTXISD::PrintConvergentCall";
886   case NVPTXISD::PrintCallUni:
887     return "NVPTXISD::PrintCallUni";
888   case NVPTXISD::PrintConvergentCallUni:
889     return "NVPTXISD::PrintConvergentCallUni";
890   case NVPTXISD::LoadParam:
891     return "NVPTXISD::LoadParam";
892   case NVPTXISD::LoadParamV2:
893     return "NVPTXISD::LoadParamV2";
894   case NVPTXISD::LoadParamV4:
895     return "NVPTXISD::LoadParamV4";
896   case NVPTXISD::StoreParam:
897     return "NVPTXISD::StoreParam";
898   case NVPTXISD::StoreParamV2:
899     return "NVPTXISD::StoreParamV2";
900   case NVPTXISD::StoreParamV4:
901     return "NVPTXISD::StoreParamV4";
902   case NVPTXISD::StoreParamS32:
903     return "NVPTXISD::StoreParamS32";
904   case NVPTXISD::StoreParamU32:
905     return "NVPTXISD::StoreParamU32";
906   case NVPTXISD::CallArgBegin:
907     return "NVPTXISD::CallArgBegin";
908   case NVPTXISD::CallArg:
909     return "NVPTXISD::CallArg";
910   case NVPTXISD::LastCallArg:
911     return "NVPTXISD::LastCallArg";
912   case NVPTXISD::CallArgEnd:
913     return "NVPTXISD::CallArgEnd";
914   case NVPTXISD::CallVoid:
915     return "NVPTXISD::CallVoid";
916   case NVPTXISD::CallVal:
917     return "NVPTXISD::CallVal";
918   case NVPTXISD::CallSymbol:
919     return "NVPTXISD::CallSymbol";
920   case NVPTXISD::Prototype:
921     return "NVPTXISD::Prototype";
922   case NVPTXISD::MoveParam:
923     return "NVPTXISD::MoveParam";
924   case NVPTXISD::StoreRetval:
925     return "NVPTXISD::StoreRetval";
926   case NVPTXISD::StoreRetvalV2:
927     return "NVPTXISD::StoreRetvalV2";
928   case NVPTXISD::StoreRetvalV4:
929     return "NVPTXISD::StoreRetvalV4";
930   case NVPTXISD::PseudoUseParam:
931     return "NVPTXISD::PseudoUseParam";
932   case NVPTXISD::RETURN:
933     return "NVPTXISD::RETURN";
934   case NVPTXISD::CallSeqBegin:
935     return "NVPTXISD::CallSeqBegin";
936   case NVPTXISD::CallSeqEnd:
937     return "NVPTXISD::CallSeqEnd";
938   case NVPTXISD::CallPrototype:
939     return "NVPTXISD::CallPrototype";
940   case NVPTXISD::ProxyReg:
941     return "NVPTXISD::ProxyReg";
942   case NVPTXISD::LoadV2:
943     return "NVPTXISD::LoadV2";
944   case NVPTXISD::LoadV4:
945     return "NVPTXISD::LoadV4";
946   case NVPTXISD::LDGV2:
947     return "NVPTXISD::LDGV2";
948   case NVPTXISD::LDGV4:
949     return "NVPTXISD::LDGV4";
950   case NVPTXISD::LDUV2:
951     return "NVPTXISD::LDUV2";
952   case NVPTXISD::LDUV4:
953     return "NVPTXISD::LDUV4";
954   case NVPTXISD::StoreV2:
955     return "NVPTXISD::StoreV2";
956   case NVPTXISD::StoreV4:
957     return "NVPTXISD::StoreV4";
958   case NVPTXISD::FUN_SHFL_CLAMP:
959     return "NVPTXISD::FUN_SHFL_CLAMP";
960   case NVPTXISD::FUN_SHFR_CLAMP:
961     return "NVPTXISD::FUN_SHFR_CLAMP";
962   case NVPTXISD::IMAD:
963     return "NVPTXISD::IMAD";
964   case NVPTXISD::BFE:
965     return "NVPTXISD::BFE";
966   case NVPTXISD::BFI:
967     return "NVPTXISD::BFI";
968   case NVPTXISD::PRMT:
969     return "NVPTXISD::PRMT";
970   case NVPTXISD::SETP_F16X2:
971     return "NVPTXISD::SETP_F16X2";
972   case NVPTXISD::SETP_BF16X2:
973     return "NVPTXISD::SETP_BF16X2";
974   case NVPTXISD::Dummy:
975     return "NVPTXISD::Dummy";
976   case NVPTXISD::MUL_WIDE_SIGNED:
977     return "NVPTXISD::MUL_WIDE_SIGNED";
978   case NVPTXISD::MUL_WIDE_UNSIGNED:
979     return "NVPTXISD::MUL_WIDE_UNSIGNED";
980   case NVPTXISD::Tex1DFloatS32:        return "NVPTXISD::Tex1DFloatS32";
981   case NVPTXISD::Tex1DFloatFloat:      return "NVPTXISD::Tex1DFloatFloat";
982   case NVPTXISD::Tex1DFloatFloatLevel:
983     return "NVPTXISD::Tex1DFloatFloatLevel";
984   case NVPTXISD::Tex1DFloatFloatGrad:
985     return "NVPTXISD::Tex1DFloatFloatGrad";
986   case NVPTXISD::Tex1DS32S32:          return "NVPTXISD::Tex1DS32S32";
987   case NVPTXISD::Tex1DS32Float:        return "NVPTXISD::Tex1DS32Float";
988   case NVPTXISD::Tex1DS32FloatLevel:
989     return "NVPTXISD::Tex1DS32FloatLevel";
990   case NVPTXISD::Tex1DS32FloatGrad:
991     return "NVPTXISD::Tex1DS32FloatGrad";
992   case NVPTXISD::Tex1DU32S32:          return "NVPTXISD::Tex1DU32S32";
993   case NVPTXISD::Tex1DU32Float:        return "NVPTXISD::Tex1DU32Float";
994   case NVPTXISD::Tex1DU32FloatLevel:
995     return "NVPTXISD::Tex1DU32FloatLevel";
996   case NVPTXISD::Tex1DU32FloatGrad:
997     return "NVPTXISD::Tex1DU32FloatGrad";
998   case NVPTXISD::Tex1DArrayFloatS32:   return "NVPTXISD::Tex1DArrayFloatS32";
999   case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
1000   case NVPTXISD::Tex1DArrayFloatFloatLevel:
1001     return "NVPTXISD::Tex1DArrayFloatFloatLevel";
1002   case NVPTXISD::Tex1DArrayFloatFloatGrad:
1003     return "NVPTXISD::Tex1DArrayFloatFloatGrad";
1004   case NVPTXISD::Tex1DArrayS32S32:     return "NVPTXISD::Tex1DArrayS32S32";
1005   case NVPTXISD::Tex1DArrayS32Float:   return "NVPTXISD::Tex1DArrayS32Float";
1006   case NVPTXISD::Tex1DArrayS32FloatLevel:
1007     return "NVPTXISD::Tex1DArrayS32FloatLevel";
1008   case NVPTXISD::Tex1DArrayS32FloatGrad:
1009     return "NVPTXISD::Tex1DArrayS32FloatGrad";
1010   case NVPTXISD::Tex1DArrayU32S32:     return "NVPTXISD::Tex1DArrayU32S32";
1011   case NVPTXISD::Tex1DArrayU32Float:   return "NVPTXISD::Tex1DArrayU32Float";
1012   case NVPTXISD::Tex1DArrayU32FloatLevel:
1013     return "NVPTXISD::Tex1DArrayU32FloatLevel";
1014   case NVPTXISD::Tex1DArrayU32FloatGrad:
1015     return "NVPTXISD::Tex1DArrayU32FloatGrad";
1016   case NVPTXISD::Tex2DFloatS32:        return "NVPTXISD::Tex2DFloatS32";
1017   case NVPTXISD::Tex2DFloatFloat:      return "NVPTXISD::Tex2DFloatFloat";
1018   case NVPTXISD::Tex2DFloatFloatLevel:
1019     return "NVPTXISD::Tex2DFloatFloatLevel";
1020   case NVPTXISD::Tex2DFloatFloatGrad:
1021     return "NVPTXISD::Tex2DFloatFloatGrad";
1022   case NVPTXISD::Tex2DS32S32:          return "NVPTXISD::Tex2DS32S32";
1023   case NVPTXISD::Tex2DS32Float:        return "NVPTXISD::Tex2DS32Float";
1024   case NVPTXISD::Tex2DS32FloatLevel:
1025     return "NVPTXISD::Tex2DS32FloatLevel";
1026   case NVPTXISD::Tex2DS32FloatGrad:
1027     return "NVPTXISD::Tex2DS32FloatGrad";
1028   case NVPTXISD::Tex2DU32S32:          return "NVPTXISD::Tex2DU32S32";
1029   case NVPTXISD::Tex2DU32Float:        return "NVPTXISD::Tex2DU32Float";
1030   case NVPTXISD::Tex2DU32FloatLevel:
1031     return "NVPTXISD::Tex2DU32FloatLevel";
1032   case NVPTXISD::Tex2DU32FloatGrad:
1033     return "NVPTXISD::Tex2DU32FloatGrad";
1034   case NVPTXISD::Tex2DArrayFloatS32:   return "NVPTXISD::Tex2DArrayFloatS32";
1035   case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
1036   case NVPTXISD::Tex2DArrayFloatFloatLevel:
1037     return "NVPTXISD::Tex2DArrayFloatFloatLevel";
1038   case NVPTXISD::Tex2DArrayFloatFloatGrad:
1039     return "NVPTXISD::Tex2DArrayFloatFloatGrad";
1040   case NVPTXISD::Tex2DArrayS32S32:     return "NVPTXISD::Tex2DArrayS32S32";
1041   case NVPTXISD::Tex2DArrayS32Float:   return "NVPTXISD::Tex2DArrayS32Float";
1042   case NVPTXISD::Tex2DArrayS32FloatLevel:
1043     return "NVPTXISD::Tex2DArrayS32FloatLevel";
1044   case NVPTXISD::Tex2DArrayS32FloatGrad:
1045     return "NVPTXISD::Tex2DArrayS32FloatGrad";
1046   case NVPTXISD::Tex2DArrayU32S32:     return "NVPTXISD::Tex2DArrayU32S32";
1047   case NVPTXISD::Tex2DArrayU32Float:   return "NVPTXISD::Tex2DArrayU32Float";
1048   case NVPTXISD::Tex2DArrayU32FloatLevel:
1049     return "NVPTXISD::Tex2DArrayU32FloatLevel";
1050   case NVPTXISD::Tex2DArrayU32FloatGrad:
1051     return "NVPTXISD::Tex2DArrayU32FloatGrad";
1052   case NVPTXISD::Tex3DFloatS32:        return "NVPTXISD::Tex3DFloatS32";
1053   case NVPTXISD::Tex3DFloatFloat:      return "NVPTXISD::Tex3DFloatFloat";
1054   case NVPTXISD::Tex3DFloatFloatLevel:
1055     return "NVPTXISD::Tex3DFloatFloatLevel";
1056   case NVPTXISD::Tex3DFloatFloatGrad:
1057     return "NVPTXISD::Tex3DFloatFloatGrad";
1058   case NVPTXISD::Tex3DS32S32:          return "NVPTXISD::Tex3DS32S32";
1059   case NVPTXISD::Tex3DS32Float:        return "NVPTXISD::Tex3DS32Float";
1060   case NVPTXISD::Tex3DS32FloatLevel:
1061     return "NVPTXISD::Tex3DS32FloatLevel";
1062   case NVPTXISD::Tex3DS32FloatGrad:
1063     return "NVPTXISD::Tex3DS32FloatGrad";
1064   case NVPTXISD::Tex3DU32S32:          return "NVPTXISD::Tex3DU32S32";
1065   case NVPTXISD::Tex3DU32Float:        return "NVPTXISD::Tex3DU32Float";
1066   case NVPTXISD::Tex3DU32FloatLevel:
1067     return "NVPTXISD::Tex3DU32FloatLevel";
1068   case NVPTXISD::Tex3DU32FloatGrad:
1069     return "NVPTXISD::Tex3DU32FloatGrad";
1070   case NVPTXISD::TexCubeFloatFloat:      return "NVPTXISD::TexCubeFloatFloat";
1071   case NVPTXISD::TexCubeFloatFloatLevel:
1072     return "NVPTXISD::TexCubeFloatFloatLevel";
1073   case NVPTXISD::TexCubeS32Float:        return "NVPTXISD::TexCubeS32Float";
1074   case NVPTXISD::TexCubeS32FloatLevel:
1075     return "NVPTXISD::TexCubeS32FloatLevel";
1076   case NVPTXISD::TexCubeU32Float:        return "NVPTXISD::TexCubeU32Float";
1077   case NVPTXISD::TexCubeU32FloatLevel:
1078     return "NVPTXISD::TexCubeU32FloatLevel";
1079   case NVPTXISD::TexCubeArrayFloatFloat:
1080     return "NVPTXISD::TexCubeArrayFloatFloat";
1081   case NVPTXISD::TexCubeArrayFloatFloatLevel:
1082     return "NVPTXISD::TexCubeArrayFloatFloatLevel";
1083   case NVPTXISD::TexCubeArrayS32Float:
1084     return "NVPTXISD::TexCubeArrayS32Float";
1085   case NVPTXISD::TexCubeArrayS32FloatLevel:
1086     return "NVPTXISD::TexCubeArrayS32FloatLevel";
1087   case NVPTXISD::TexCubeArrayU32Float:
1088     return "NVPTXISD::TexCubeArrayU32Float";
1089   case NVPTXISD::TexCubeArrayU32FloatLevel:
1090     return "NVPTXISD::TexCubeArrayU32FloatLevel";
1091   case NVPTXISD::Tld4R2DFloatFloat:
1092     return "NVPTXISD::Tld4R2DFloatFloat";
1093   case NVPTXISD::Tld4G2DFloatFloat:
1094     return "NVPTXISD::Tld4G2DFloatFloat";
1095   case NVPTXISD::Tld4B2DFloatFloat:
1096     return "NVPTXISD::Tld4B2DFloatFloat";
1097   case NVPTXISD::Tld4A2DFloatFloat:
1098     return "NVPTXISD::Tld4A2DFloatFloat";
1099   case NVPTXISD::Tld4R2DS64Float:
1100     return "NVPTXISD::Tld4R2DS64Float";
1101   case NVPTXISD::Tld4G2DS64Float:
1102     return "NVPTXISD::Tld4G2DS64Float";
1103   case NVPTXISD::Tld4B2DS64Float:
1104     return "NVPTXISD::Tld4B2DS64Float";
1105   case NVPTXISD::Tld4A2DS64Float:
1106     return "NVPTXISD::Tld4A2DS64Float";
1107   case NVPTXISD::Tld4R2DU64Float:
1108     return "NVPTXISD::Tld4R2DU64Float";
1109   case NVPTXISD::Tld4G2DU64Float:
1110     return "NVPTXISD::Tld4G2DU64Float";
1111   case NVPTXISD::Tld4B2DU64Float:
1112     return "NVPTXISD::Tld4B2DU64Float";
1113   case NVPTXISD::Tld4A2DU64Float:
1114     return "NVPTXISD::Tld4A2DU64Float";
1115 
1116   case NVPTXISD::TexUnified1DFloatS32:
1117     return "NVPTXISD::TexUnified1DFloatS32";
1118   case NVPTXISD::TexUnified1DFloatFloat:
1119     return "NVPTXISD::TexUnified1DFloatFloat";
1120   case NVPTXISD::TexUnified1DFloatFloatLevel:
1121     return "NVPTXISD::TexUnified1DFloatFloatLevel";
1122   case NVPTXISD::TexUnified1DFloatFloatGrad:
1123     return "NVPTXISD::TexUnified1DFloatFloatGrad";
1124   case NVPTXISD::TexUnified1DS32S32:
1125     return "NVPTXISD::TexUnified1DS32S32";
1126   case NVPTXISD::TexUnified1DS32Float:
1127     return "NVPTXISD::TexUnified1DS32Float";
1128   case NVPTXISD::TexUnified1DS32FloatLevel:
1129     return "NVPTXISD::TexUnified1DS32FloatLevel";
1130   case NVPTXISD::TexUnified1DS32FloatGrad:
1131     return "NVPTXISD::TexUnified1DS32FloatGrad";
1132   case NVPTXISD::TexUnified1DU32S32:
1133     return "NVPTXISD::TexUnified1DU32S32";
1134   case NVPTXISD::TexUnified1DU32Float:
1135     return "NVPTXISD::TexUnified1DU32Float";
1136   case NVPTXISD::TexUnified1DU32FloatLevel:
1137     return "NVPTXISD::TexUnified1DU32FloatLevel";
1138   case NVPTXISD::TexUnified1DU32FloatGrad:
1139     return "NVPTXISD::TexUnified1DU32FloatGrad";
1140   case NVPTXISD::TexUnified1DArrayFloatS32:
1141     return "NVPTXISD::TexUnified1DArrayFloatS32";
1142   case NVPTXISD::TexUnified1DArrayFloatFloat:
1143     return "NVPTXISD::TexUnified1DArrayFloatFloat";
1144   case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
1145     return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
1146   case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
1147     return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
1148   case NVPTXISD::TexUnified1DArrayS32S32:
1149     return "NVPTXISD::TexUnified1DArrayS32S32";
1150   case NVPTXISD::TexUnified1DArrayS32Float:
1151     return "NVPTXISD::TexUnified1DArrayS32Float";
1152   case NVPTXISD::TexUnified1DArrayS32FloatLevel:
1153     return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
1154   case NVPTXISD::TexUnified1DArrayS32FloatGrad:
1155     return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
1156   case NVPTXISD::TexUnified1DArrayU32S32:
1157     return "NVPTXISD::TexUnified1DArrayU32S32";
1158   case NVPTXISD::TexUnified1DArrayU32Float:
1159     return "NVPTXISD::TexUnified1DArrayU32Float";
1160   case NVPTXISD::TexUnified1DArrayU32FloatLevel:
1161     return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
1162   case NVPTXISD::TexUnified1DArrayU32FloatGrad:
1163     return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
1164   case NVPTXISD::TexUnified2DFloatS32:
1165     return "NVPTXISD::TexUnified2DFloatS32";
1166   case NVPTXISD::TexUnified2DFloatFloat:
1167     return "NVPTXISD::TexUnified2DFloatFloat";
1168   case NVPTXISD::TexUnified2DFloatFloatLevel:
1169     return "NVPTXISD::TexUnified2DFloatFloatLevel";
1170   case NVPTXISD::TexUnified2DFloatFloatGrad:
1171     return "NVPTXISD::TexUnified2DFloatFloatGrad";
1172   case NVPTXISD::TexUnified2DS32S32:
1173     return "NVPTXISD::TexUnified2DS32S32";
1174   case NVPTXISD::TexUnified2DS32Float:
1175     return "NVPTXISD::TexUnified2DS32Float";
1176   case NVPTXISD::TexUnified2DS32FloatLevel:
1177     return "NVPTXISD::TexUnified2DS32FloatLevel";
1178   case NVPTXISD::TexUnified2DS32FloatGrad:
1179     return "NVPTXISD::TexUnified2DS32FloatGrad";
1180   case NVPTXISD::TexUnified2DU32S32:
1181     return "NVPTXISD::TexUnified2DU32S32";
1182   case NVPTXISD::TexUnified2DU32Float:
1183     return "NVPTXISD::TexUnified2DU32Float";
1184   case NVPTXISD::TexUnified2DU32FloatLevel:
1185     return "NVPTXISD::TexUnified2DU32FloatLevel";
1186   case NVPTXISD::TexUnified2DU32FloatGrad:
1187     return "NVPTXISD::TexUnified2DU32FloatGrad";
1188   case NVPTXISD::TexUnified2DArrayFloatS32:
1189     return "NVPTXISD::TexUnified2DArrayFloatS32";
1190   case NVPTXISD::TexUnified2DArrayFloatFloat:
1191     return "NVPTXISD::TexUnified2DArrayFloatFloat";
1192   case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
1193     return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
1194   case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
1195     return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
1196   case NVPTXISD::TexUnified2DArrayS32S32:
1197     return "NVPTXISD::TexUnified2DArrayS32S32";
1198   case NVPTXISD::TexUnified2DArrayS32Float:
1199     return "NVPTXISD::TexUnified2DArrayS32Float";
1200   case NVPTXISD::TexUnified2DArrayS32FloatLevel:
1201     return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
1202   case NVPTXISD::TexUnified2DArrayS32FloatGrad:
1203     return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
1204   case NVPTXISD::TexUnified2DArrayU32S32:
1205     return "NVPTXISD::TexUnified2DArrayU32S32";
1206   case NVPTXISD::TexUnified2DArrayU32Float:
1207     return "NVPTXISD::TexUnified2DArrayU32Float";
1208   case NVPTXISD::TexUnified2DArrayU32FloatLevel:
1209     return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
1210   case NVPTXISD::TexUnified2DArrayU32FloatGrad:
1211     return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
1212   case NVPTXISD::TexUnified3DFloatS32:
1213     return "NVPTXISD::TexUnified3DFloatS32";
1214   case NVPTXISD::TexUnified3DFloatFloat:
1215     return "NVPTXISD::TexUnified3DFloatFloat";
1216   case NVPTXISD::TexUnified3DFloatFloatLevel:
1217     return "NVPTXISD::TexUnified3DFloatFloatLevel";
1218   case NVPTXISD::TexUnified3DFloatFloatGrad:
1219     return "NVPTXISD::TexUnified3DFloatFloatGrad";
1220   case NVPTXISD::TexUnified3DS32S32:
1221     return "NVPTXISD::TexUnified3DS32S32";
1222   case NVPTXISD::TexUnified3DS32Float:
1223     return "NVPTXISD::TexUnified3DS32Float";
1224   case NVPTXISD::TexUnified3DS32FloatLevel:
1225     return "NVPTXISD::TexUnified3DS32FloatLevel";
1226   case NVPTXISD::TexUnified3DS32FloatGrad:
1227     return "NVPTXISD::TexUnified3DS32FloatGrad";
1228   case NVPTXISD::TexUnified3DU32S32:
1229     return "NVPTXISD::TexUnified3DU32S32";
1230   case NVPTXISD::TexUnified3DU32Float:
1231     return "NVPTXISD::TexUnified3DU32Float";
1232   case NVPTXISD::TexUnified3DU32FloatLevel:
1233     return "NVPTXISD::TexUnified3DU32FloatLevel";
1234   case NVPTXISD::TexUnified3DU32FloatGrad:
1235     return "NVPTXISD::TexUnified3DU32FloatGrad";
1236   case NVPTXISD::TexUnifiedCubeFloatFloat:
1237     return "NVPTXISD::TexUnifiedCubeFloatFloat";
1238   case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
1239     return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
1240   case NVPTXISD::TexUnifiedCubeS32Float:
1241     return "NVPTXISD::TexUnifiedCubeS32Float";
1242   case NVPTXISD::TexUnifiedCubeS32FloatLevel:
1243     return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
1244   case NVPTXISD::TexUnifiedCubeU32Float:
1245     return "NVPTXISD::TexUnifiedCubeU32Float";
1246   case NVPTXISD::TexUnifiedCubeU32FloatLevel:
1247     return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
1248   case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
1249     return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
1250   case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
1251     return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
1252   case NVPTXISD::TexUnifiedCubeArrayS32Float:
1253     return "NVPTXISD::TexUnifiedCubeArrayS32Float";
1254   case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
1255     return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
1256   case NVPTXISD::TexUnifiedCubeArrayU32Float:
1257     return "NVPTXISD::TexUnifiedCubeArrayU32Float";
1258   case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
1259     return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
1260   case NVPTXISD::Tld4UnifiedR2DFloatFloat:
1261     return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
1262   case NVPTXISD::Tld4UnifiedG2DFloatFloat:
1263     return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
1264   case NVPTXISD::Tld4UnifiedB2DFloatFloat:
1265     return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
1266   case NVPTXISD::Tld4UnifiedA2DFloatFloat:
1267     return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
1268   case NVPTXISD::Tld4UnifiedR2DS64Float:
1269     return "NVPTXISD::Tld4UnifiedR2DS64Float";
1270   case NVPTXISD::Tld4UnifiedG2DS64Float:
1271     return "NVPTXISD::Tld4UnifiedG2DS64Float";
1272   case NVPTXISD::Tld4UnifiedB2DS64Float:
1273     return "NVPTXISD::Tld4UnifiedB2DS64Float";
1274   case NVPTXISD::Tld4UnifiedA2DS64Float:
1275     return "NVPTXISD::Tld4UnifiedA2DS64Float";
1276   case NVPTXISD::Tld4UnifiedR2DU64Float:
1277     return "NVPTXISD::Tld4UnifiedR2DU64Float";
1278   case NVPTXISD::Tld4UnifiedG2DU64Float:
1279     return "NVPTXISD::Tld4UnifiedG2DU64Float";
1280   case NVPTXISD::Tld4UnifiedB2DU64Float:
1281     return "NVPTXISD::Tld4UnifiedB2DU64Float";
1282   case NVPTXISD::Tld4UnifiedA2DU64Float:
1283     return "NVPTXISD::Tld4UnifiedA2DU64Float";
1284 
1285   case NVPTXISD::Suld1DI8Clamp:          return "NVPTXISD::Suld1DI8Clamp";
1286   case NVPTXISD::Suld1DI16Clamp:         return "NVPTXISD::Suld1DI16Clamp";
1287   case NVPTXISD::Suld1DI32Clamp:         return "NVPTXISD::Suld1DI32Clamp";
1288   case NVPTXISD::Suld1DI64Clamp:         return "NVPTXISD::Suld1DI64Clamp";
1289   case NVPTXISD::Suld1DV2I8Clamp:        return "NVPTXISD::Suld1DV2I8Clamp";
1290   case NVPTXISD::Suld1DV2I16Clamp:       return "NVPTXISD::Suld1DV2I16Clamp";
1291   case NVPTXISD::Suld1DV2I32Clamp:       return "NVPTXISD::Suld1DV2I32Clamp";
1292   case NVPTXISD::Suld1DV2I64Clamp:       return "NVPTXISD::Suld1DV2I64Clamp";
1293   case NVPTXISD::Suld1DV4I8Clamp:        return "NVPTXISD::Suld1DV4I8Clamp";
1294   case NVPTXISD::Suld1DV4I16Clamp:       return "NVPTXISD::Suld1DV4I16Clamp";
1295   case NVPTXISD::Suld1DV4I32Clamp:       return "NVPTXISD::Suld1DV4I32Clamp";
1296 
1297   case NVPTXISD::Suld1DArrayI8Clamp:   return "NVPTXISD::Suld1DArrayI8Clamp";
1298   case NVPTXISD::Suld1DArrayI16Clamp:  return "NVPTXISD::Suld1DArrayI16Clamp";
1299   case NVPTXISD::Suld1DArrayI32Clamp:  return "NVPTXISD::Suld1DArrayI32Clamp";
1300   case NVPTXISD::Suld1DArrayI64Clamp:  return "NVPTXISD::Suld1DArrayI64Clamp";
1301   case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
1302   case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
1303   case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
1304   case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
1305   case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
1306   case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
1307   case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
1308 
1309   case NVPTXISD::Suld2DI8Clamp:          return "NVPTXISD::Suld2DI8Clamp";
1310   case NVPTXISD::Suld2DI16Clamp:         return "NVPTXISD::Suld2DI16Clamp";
1311   case NVPTXISD::Suld2DI32Clamp:         return "NVPTXISD::Suld2DI32Clamp";
1312   case NVPTXISD::Suld2DI64Clamp:         return "NVPTXISD::Suld2DI64Clamp";
1313   case NVPTXISD::Suld2DV2I8Clamp:        return "NVPTXISD::Suld2DV2I8Clamp";
1314   case NVPTXISD::Suld2DV2I16Clamp:       return "NVPTXISD::Suld2DV2I16Clamp";
1315   case NVPTXISD::Suld2DV2I32Clamp:       return "NVPTXISD::Suld2DV2I32Clamp";
1316   case NVPTXISD::Suld2DV2I64Clamp:       return "NVPTXISD::Suld2DV2I64Clamp";
1317   case NVPTXISD::Suld2DV4I8Clamp:        return "NVPTXISD::Suld2DV4I8Clamp";
1318   case NVPTXISD::Suld2DV4I16Clamp:       return "NVPTXISD::Suld2DV4I16Clamp";
1319   case NVPTXISD::Suld2DV4I32Clamp:       return "NVPTXISD::Suld2DV4I32Clamp";
1320 
1321   case NVPTXISD::Suld2DArrayI8Clamp:   return "NVPTXISD::Suld2DArrayI8Clamp";
1322   case NVPTXISD::Suld2DArrayI16Clamp:  return "NVPTXISD::Suld2DArrayI16Clamp";
1323   case NVPTXISD::Suld2DArrayI32Clamp:  return "NVPTXISD::Suld2DArrayI32Clamp";
1324   case NVPTXISD::Suld2DArrayI64Clamp:  return "NVPTXISD::Suld2DArrayI64Clamp";
1325   case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
1326   case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
1327   case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
1328   case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
1329   case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
1330   case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
1331   case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
1332 
1333   case NVPTXISD::Suld3DI8Clamp:          return "NVPTXISD::Suld3DI8Clamp";
1334   case NVPTXISD::Suld3DI16Clamp:         return "NVPTXISD::Suld3DI16Clamp";
1335   case NVPTXISD::Suld3DI32Clamp:         return "NVPTXISD::Suld3DI32Clamp";
1336   case NVPTXISD::Suld3DI64Clamp:         return "NVPTXISD::Suld3DI64Clamp";
1337   case NVPTXISD::Suld3DV2I8Clamp:        return "NVPTXISD::Suld3DV2I8Clamp";
1338   case NVPTXISD::Suld3DV2I16Clamp:       return "NVPTXISD::Suld3DV2I16Clamp";
1339   case NVPTXISD::Suld3DV2I32Clamp:       return "NVPTXISD::Suld3DV2I32Clamp";
1340   case NVPTXISD::Suld3DV2I64Clamp:       return "NVPTXISD::Suld3DV2I64Clamp";
1341   case NVPTXISD::Suld3DV4I8Clamp:        return "NVPTXISD::Suld3DV4I8Clamp";
1342   case NVPTXISD::Suld3DV4I16Clamp:       return "NVPTXISD::Suld3DV4I16Clamp";
1343   case NVPTXISD::Suld3DV4I32Clamp:       return "NVPTXISD::Suld3DV4I32Clamp";
1344 
1345   case NVPTXISD::Suld1DI8Trap:          return "NVPTXISD::Suld1DI8Trap";
1346   case NVPTXISD::Suld1DI16Trap:         return "NVPTXISD::Suld1DI16Trap";
1347   case NVPTXISD::Suld1DI32Trap:         return "NVPTXISD::Suld1DI32Trap";
1348   case NVPTXISD::Suld1DI64Trap:         return "NVPTXISD::Suld1DI64Trap";
1349   case NVPTXISD::Suld1DV2I8Trap:        return "NVPTXISD::Suld1DV2I8Trap";
1350   case NVPTXISD::Suld1DV2I16Trap:       return "NVPTXISD::Suld1DV2I16Trap";
1351   case NVPTXISD::Suld1DV2I32Trap:       return "NVPTXISD::Suld1DV2I32Trap";
1352   case NVPTXISD::Suld1DV2I64Trap:       return "NVPTXISD::Suld1DV2I64Trap";
1353   case NVPTXISD::Suld1DV4I8Trap:        return "NVPTXISD::Suld1DV4I8Trap";
1354   case NVPTXISD::Suld1DV4I16Trap:       return "NVPTXISD::Suld1DV4I16Trap";
1355   case NVPTXISD::Suld1DV4I32Trap:       return "NVPTXISD::Suld1DV4I32Trap";
1356 
1357   case NVPTXISD::Suld1DArrayI8Trap:     return "NVPTXISD::Suld1DArrayI8Trap";
1358   case NVPTXISD::Suld1DArrayI16Trap:    return "NVPTXISD::Suld1DArrayI16Trap";
1359   case NVPTXISD::Suld1DArrayI32Trap:    return "NVPTXISD::Suld1DArrayI32Trap";
1360   case NVPTXISD::Suld1DArrayI64Trap:    return "NVPTXISD::Suld1DArrayI64Trap";
1361   case NVPTXISD::Suld1DArrayV2I8Trap:   return "NVPTXISD::Suld1DArrayV2I8Trap";
1362   case NVPTXISD::Suld1DArrayV2I16Trap:  return "NVPTXISD::Suld1DArrayV2I16Trap";
1363   case NVPTXISD::Suld1DArrayV2I32Trap:  return "NVPTXISD::Suld1DArrayV2I32Trap";
1364   case NVPTXISD::Suld1DArrayV2I64Trap:  return "NVPTXISD::Suld1DArrayV2I64Trap";
1365   case NVPTXISD::Suld1DArrayV4I8Trap:   return "NVPTXISD::Suld1DArrayV4I8Trap";
1366   case NVPTXISD::Suld1DArrayV4I16Trap:  return "NVPTXISD::Suld1DArrayV4I16Trap";
1367   case NVPTXISD::Suld1DArrayV4I32Trap:  return "NVPTXISD::Suld1DArrayV4I32Trap";
1368 
1369   case NVPTXISD::Suld2DI8Trap:          return "NVPTXISD::Suld2DI8Trap";
1370   case NVPTXISD::Suld2DI16Trap:         return "NVPTXISD::Suld2DI16Trap";
1371   case NVPTXISD::Suld2DI32Trap:         return "NVPTXISD::Suld2DI32Trap";
1372   case NVPTXISD::Suld2DI64Trap:         return "NVPTXISD::Suld2DI64Trap";
1373   case NVPTXISD::Suld2DV2I8Trap:        return "NVPTXISD::Suld2DV2I8Trap";
1374   case NVPTXISD::Suld2DV2I16Trap:       return "NVPTXISD::Suld2DV2I16Trap";
1375   case NVPTXISD::Suld2DV2I32Trap:       return "NVPTXISD::Suld2DV2I32Trap";
1376   case NVPTXISD::Suld2DV2I64Trap:       return "NVPTXISD::Suld2DV2I64Trap";
1377   case NVPTXISD::Suld2DV4I8Trap:        return "NVPTXISD::Suld2DV4I8Trap";
1378   case NVPTXISD::Suld2DV4I16Trap:       return "NVPTXISD::Suld2DV4I16Trap";
1379   case NVPTXISD::Suld2DV4I32Trap:       return "NVPTXISD::Suld2DV4I32Trap";
1380 
1381   case NVPTXISD::Suld2DArrayI8Trap:     return "NVPTXISD::Suld2DArrayI8Trap";
1382   case NVPTXISD::Suld2DArrayI16Trap:    return "NVPTXISD::Suld2DArrayI16Trap";
1383   case NVPTXISD::Suld2DArrayI32Trap:    return "NVPTXISD::Suld2DArrayI32Trap";
1384   case NVPTXISD::Suld2DArrayI64Trap:    return "NVPTXISD::Suld2DArrayI64Trap";
1385   case NVPTXISD::Suld2DArrayV2I8Trap:   return "NVPTXISD::Suld2DArrayV2I8Trap";
1386   case NVPTXISD::Suld2DArrayV2I16Trap:  return "NVPTXISD::Suld2DArrayV2I16Trap";
1387   case NVPTXISD::Suld2DArrayV2I32Trap:  return "NVPTXISD::Suld2DArrayV2I32Trap";
1388   case NVPTXISD::Suld2DArrayV2I64Trap:  return "NVPTXISD::Suld2DArrayV2I64Trap";
1389   case NVPTXISD::Suld2DArrayV4I8Trap:   return "NVPTXISD::Suld2DArrayV4I8Trap";
1390   case NVPTXISD::Suld2DArrayV4I16Trap:  return "NVPTXISD::Suld2DArrayV4I16Trap";
1391   case NVPTXISD::Suld2DArrayV4I32Trap:  return "NVPTXISD::Suld2DArrayV4I32Trap";
1392 
1393   case NVPTXISD::Suld3DI8Trap:          return "NVPTXISD::Suld3DI8Trap";
1394   case NVPTXISD::Suld3DI16Trap:         return "NVPTXISD::Suld3DI16Trap";
1395   case NVPTXISD::Suld3DI32Trap:         return "NVPTXISD::Suld3DI32Trap";
1396   case NVPTXISD::Suld3DI64Trap:         return "NVPTXISD::Suld3DI64Trap";
1397   case NVPTXISD::Suld3DV2I8Trap:        return "NVPTXISD::Suld3DV2I8Trap";
1398   case NVPTXISD::Suld3DV2I16Trap:       return "NVPTXISD::Suld3DV2I16Trap";
1399   case NVPTXISD::Suld3DV2I32Trap:       return "NVPTXISD::Suld3DV2I32Trap";
1400   case NVPTXISD::Suld3DV2I64Trap:       return "NVPTXISD::Suld3DV2I64Trap";
1401   case NVPTXISD::Suld3DV4I8Trap:        return "NVPTXISD::Suld3DV4I8Trap";
1402   case NVPTXISD::Suld3DV4I16Trap:       return "NVPTXISD::Suld3DV4I16Trap";
1403   case NVPTXISD::Suld3DV4I32Trap:       return "NVPTXISD::Suld3DV4I32Trap";
1404 
1405   case NVPTXISD::Suld1DI8Zero:          return "NVPTXISD::Suld1DI8Zero";
1406   case NVPTXISD::Suld1DI16Zero:         return "NVPTXISD::Suld1DI16Zero";
1407   case NVPTXISD::Suld1DI32Zero:         return "NVPTXISD::Suld1DI32Zero";
1408   case NVPTXISD::Suld1DI64Zero:         return "NVPTXISD::Suld1DI64Zero";
1409   case NVPTXISD::Suld1DV2I8Zero:        return "NVPTXISD::Suld1DV2I8Zero";
1410   case NVPTXISD::Suld1DV2I16Zero:       return "NVPTXISD::Suld1DV2I16Zero";
1411   case NVPTXISD::Suld1DV2I32Zero:       return "NVPTXISD::Suld1DV2I32Zero";
1412   case NVPTXISD::Suld1DV2I64Zero:       return "NVPTXISD::Suld1DV2I64Zero";
1413   case NVPTXISD::Suld1DV4I8Zero:        return "NVPTXISD::Suld1DV4I8Zero";
1414   case NVPTXISD::Suld1DV4I16Zero:       return "NVPTXISD::Suld1DV4I16Zero";
1415   case NVPTXISD::Suld1DV4I32Zero:       return "NVPTXISD::Suld1DV4I32Zero";
1416 
1417   case NVPTXISD::Suld1DArrayI8Zero:     return "NVPTXISD::Suld1DArrayI8Zero";
1418   case NVPTXISD::Suld1DArrayI16Zero:    return "NVPTXISD::Suld1DArrayI16Zero";
1419   case NVPTXISD::Suld1DArrayI32Zero:    return "NVPTXISD::Suld1DArrayI32Zero";
1420   case NVPTXISD::Suld1DArrayI64Zero:    return "NVPTXISD::Suld1DArrayI64Zero";
1421   case NVPTXISD::Suld1DArrayV2I8Zero:   return "NVPTXISD::Suld1DArrayV2I8Zero";
1422   case NVPTXISD::Suld1DArrayV2I16Zero:  return "NVPTXISD::Suld1DArrayV2I16Zero";
1423   case NVPTXISD::Suld1DArrayV2I32Zero:  return "NVPTXISD::Suld1DArrayV2I32Zero";
1424   case NVPTXISD::Suld1DArrayV2I64Zero:  return "NVPTXISD::Suld1DArrayV2I64Zero";
1425   case NVPTXISD::Suld1DArrayV4I8Zero:   return "NVPTXISD::Suld1DArrayV4I8Zero";
1426   case NVPTXISD::Suld1DArrayV4I16Zero:  return "NVPTXISD::Suld1DArrayV4I16Zero";
1427   case NVPTXISD::Suld1DArrayV4I32Zero:  return "NVPTXISD::Suld1DArrayV4I32Zero";
1428 
1429   case NVPTXISD::Suld2DI8Zero:          return "NVPTXISD::Suld2DI8Zero";
1430   case NVPTXISD::Suld2DI16Zero:         return "NVPTXISD::Suld2DI16Zero";
1431   case NVPTXISD::Suld2DI32Zero:         return "NVPTXISD::Suld2DI32Zero";
1432   case NVPTXISD::Suld2DI64Zero:         return "NVPTXISD::Suld2DI64Zero";
1433   case NVPTXISD::Suld2DV2I8Zero:        return "NVPTXISD::Suld2DV2I8Zero";
1434   case NVPTXISD::Suld2DV2I16Zero:       return "NVPTXISD::Suld2DV2I16Zero";
1435   case NVPTXISD::Suld2DV2I32Zero:       return "NVPTXISD::Suld2DV2I32Zero";
1436   case NVPTXISD::Suld2DV2I64Zero:       return "NVPTXISD::Suld2DV2I64Zero";
1437   case NVPTXISD::Suld2DV4I8Zero:        return "NVPTXISD::Suld2DV4I8Zero";
1438   case NVPTXISD::Suld2DV4I16Zero:       return "NVPTXISD::Suld2DV4I16Zero";
1439   case NVPTXISD::Suld2DV4I32Zero:       return "NVPTXISD::Suld2DV4I32Zero";
1440 
1441   case NVPTXISD::Suld2DArrayI8Zero:     return "NVPTXISD::Suld2DArrayI8Zero";
1442   case NVPTXISD::Suld2DArrayI16Zero:    return "NVPTXISD::Suld2DArrayI16Zero";
1443   case NVPTXISD::Suld2DArrayI32Zero:    return "NVPTXISD::Suld2DArrayI32Zero";
1444   case NVPTXISD::Suld2DArrayI64Zero:    return "NVPTXISD::Suld2DArrayI64Zero";
1445   case NVPTXISD::Suld2DArrayV2I8Zero:   return "NVPTXISD::Suld2DArrayV2I8Zero";
1446   case NVPTXISD::Suld2DArrayV2I16Zero:  return "NVPTXISD::Suld2DArrayV2I16Zero";
1447   case NVPTXISD::Suld2DArrayV2I32Zero:  return "NVPTXISD::Suld2DArrayV2I32Zero";
1448   case NVPTXISD::Suld2DArrayV2I64Zero:  return "NVPTXISD::Suld2DArrayV2I64Zero";
1449   case NVPTXISD::Suld2DArrayV4I8Zero:   return "NVPTXISD::Suld2DArrayV4I8Zero";
1450   case NVPTXISD::Suld2DArrayV4I16Zero:  return "NVPTXISD::Suld2DArrayV4I16Zero";
1451   case NVPTXISD::Suld2DArrayV4I32Zero:  return "NVPTXISD::Suld2DArrayV4I32Zero";
1452 
1453   case NVPTXISD::Suld3DI8Zero:          return "NVPTXISD::Suld3DI8Zero";
1454   case NVPTXISD::Suld3DI16Zero:         return "NVPTXISD::Suld3DI16Zero";
1455   case NVPTXISD::Suld3DI32Zero:         return "NVPTXISD::Suld3DI32Zero";
1456   case NVPTXISD::Suld3DI64Zero:         return "NVPTXISD::Suld3DI64Zero";
1457   case NVPTXISD::Suld3DV2I8Zero:        return "NVPTXISD::Suld3DV2I8Zero";
1458   case NVPTXISD::Suld3DV2I16Zero:       return "NVPTXISD::Suld3DV2I16Zero";
1459   case NVPTXISD::Suld3DV2I32Zero:       return "NVPTXISD::Suld3DV2I32Zero";
1460   case NVPTXISD::Suld3DV2I64Zero:       return "NVPTXISD::Suld3DV2I64Zero";
1461   case NVPTXISD::Suld3DV4I8Zero:        return "NVPTXISD::Suld3DV4I8Zero";
1462   case NVPTXISD::Suld3DV4I16Zero:       return "NVPTXISD::Suld3DV4I16Zero";
1463   case NVPTXISD::Suld3DV4I32Zero:       return "NVPTXISD::Suld3DV4I32Zero";
1464   }
1465   return nullptr;
1466 }
1467 
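// Vectors of i1 are split so each element can be legalized as a predicate,
// while the packed 16-bit pairs (v2f16/v2bf16/v2i16) stay legal and are
// handled as a single 32-bit register.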
1468 TargetLoweringBase::LegalizeTypeAction
1469 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1470   if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1471       VT.getScalarType() == MVT::i1)
1472     return TypeSplitVector;
1473   if (Isv2x16VT(VT))
1474     return TypeLegal;
1475   return TargetLoweringBase::getPreferredVectorAction(VT);
1476 }
1477 
1478 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1479                                              int Enabled, int &ExtraSteps,
1480                                              bool &UseOneConst,
1481                                              bool Reciprocal) const {
1482   if (!(Enabled == ReciprocalEstimate::Enabled ||
1483         (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
1484     return SDValue();
1485 
1486   if (ExtraSteps == ReciprocalEstimate::Unspecified)
1487     ExtraSteps = 0;
1488 
1489   SDLoc DL(Operand);
1490   EVT VT = Operand.getValueType();
1491   bool Ftz = useF32FTZ(DAG.getMachineFunction());
1492 
1493   auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1494     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1495                        DAG.getConstant(IID, DL, MVT::i32), Operand);
1496   };
1497 
1498   // The sqrt and rsqrt refinement processes assume we always start out with an
1499   // approximation of the rsqrt.  Therefore, if we're going to do any refinement
1500   // (i.e. ExtraSteps > 0), we must return an rsqrt.  But if we're *not* doing
1501   // any refinement, we must return a regular sqrt.
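  // For reference, the intrinsics below lower to the PTX
  // rsqrt.approx.{f32,f64} and sqrt.approx.f32 instructions (with .ftz
  // variants when flushing denormals); any refinement iterations are added by
  // the generic estimate-refinement code in the DAG combiner.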
1502   if (Reciprocal || ExtraSteps > 0) {
1503     if (VT == MVT::f32)
1504       return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1505                                    : Intrinsic::nvvm_rsqrt_approx_f);
1506     else if (VT == MVT::f64)
1507       return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1508     else
1509       return SDValue();
1510   } else {
1511     if (VT == MVT::f32)
1512       return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1513                                    : Intrinsic::nvvm_sqrt_approx_f);
1514     else {
1515       // There's no sqrt.approx.f64 instruction, so we emit
1516       // reciprocal(rsqrt(x)).  This is faster than
1517       // select(x == 0, 0, x * rsqrt(x)).  (In fact, it's faster than plain
1518       // x * rsqrt(x).)
1519       return DAG.getNode(
1520           ISD::INTRINSIC_WO_CHAIN, DL, VT,
1521           DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1522           MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1523     }
1524   }
1525 }
1526 
1527 SDValue
1528 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
1529   SDLoc dl(Op);
1530   const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1531   auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1532   Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1533   return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1534 }
1535 
1536 static bool IsTypePassedAsArray(const Type *Ty) {
1537   return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) ||
1538          Ty->isHalfTy() || Ty->isBFloatTy();
1539 }
1540 
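// Build the ".callprototype" string used for indirect calls. For example
// (illustrative only), a callee of type "i32 (i32, float)" yields roughly:
//   prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b32 _);
// Aggregates, vectors, i128, and fp16/bf16 values instead use the
// ".param .align <N> .b8 _[<size>]" array form emitted below.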
1541 std::string NVPTXTargetLowering::getPrototype(
1542     const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1543     const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
1544     std::optional<std::pair<unsigned, const APInt &>> VAInfo,
1545     const CallBase &CB, unsigned UniqueCallSite) const {
1546   auto PtrVT = getPointerTy(DL);
1547 
1548   bool isABI = (STI.getSmVersion() >= 20);
1549   assert(isABI && "Non-ABI compilation is not supported");
1550   if (!isABI)
1551     return "";
1552 
1553   std::string Prototype;
1554   raw_string_ostream O(Prototype);
1555   O << "prototype_" << UniqueCallSite << " : .callprototype ";
1556 
1557   if (retTy->getTypeID() == Type::VoidTyID) {
1558     O << "()";
1559   } else {
1560     O << "(";
1561     if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) &&
1562         !IsTypePassedAsArray(retTy)) {
1563       unsigned size = 0;
1564       if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1565         size = ITy->getBitWidth();
1566       } else {
1567         assert(retTy->isFloatingPointTy() &&
1568                "Floating point type expected here");
1569         size = retTy->getPrimitiveSizeInBits();
1570       }
1571       // PTX ABI requires all scalar return values to be at least 32
1572       // bits in size.  fp16 normally uses .b16 as its storage type in
1573       // PTX, so its size must be adjusted here, too.
1574       size = promoteScalarArgumentSize(size);
1575 
1576       O << ".param .b" << size << " _";
1577     } else if (isa<PointerType>(retTy)) {
1578       O << ".param .b" << PtrVT.getSizeInBits() << " _";
1579     } else if (IsTypePassedAsArray(retTy)) {
1580       O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
1581         << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
1582     } else {
1583       llvm_unreachable("Unknown return type");
1584     }
1585     O << ") ";
1586   }
1587   O << "_ (";
1588 
1589   bool first = true;
1590 
1591   const Function *F = CB.getFunction();
1592   unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
1593   for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
1594     Type *Ty = Args[i].Ty;
1595     if (!first) {
1596       O << ", ";
1597     }
1598     first = false;
1599 
1600     if (!Outs[OIdx].Flags.isByVal()) {
1601       if (IsTypePassedAsArray(Ty)) {
1602         unsigned ParamAlign = 0;
1603         const CallInst *CallI = cast<CallInst>(&CB);
1604         // +1 because index 0 is reserved for return type alignment
1605         if (!getAlign(*CallI, i + 1, ParamAlign))
1606           ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value();
1607         O << ".param .align " << ParamAlign << " .b8 ";
1608         O << "_";
1609         O << "[" << DL.getTypeAllocSize(Ty) << "]";
1610         // update the index for Outs
1611         SmallVector<EVT, 16> vtparts;
1612         ComputeValueVTs(*this, DL, Ty, vtparts);
1613         if (unsigned len = vtparts.size())
1614           OIdx += len - 1;
1615         continue;
1616       }
1617       // i8 types in IR will be i16 types in SDAG
1618       assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1619               (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1620              "type mismatch between callee prototype and arguments");
1621       // scalar type
1622       unsigned sz = 0;
1623       if (isa<IntegerType>(Ty)) {
1624         sz = cast<IntegerType>(Ty)->getBitWidth();
1625         sz = promoteScalarArgumentSize(sz);
1626       } else if (isa<PointerType>(Ty)) {
1627         sz = PtrVT.getSizeInBits();
1628       } else {
1629         sz = Ty->getPrimitiveSizeInBits();
1630       }
1631       O << ".param .b" << sz << " ";
1632       O << "_";
1633       continue;
1634     }
1635 
1636     Type *ETy = Args[i].IndirectType;
1637     Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1638     Align ParamByValAlign =
1639         getFunctionByValParamAlign(F, ETy, InitialAlign, DL);
1640 
1641     O << ".param .align " << ParamByValAlign.value() << " .b8 ";
1642     O << "_";
1643     O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
1644   }
1645 
1646   if (VAInfo)
1647     O << (first ? "" : ",") << " .param .align " << VAInfo->second
1648       << " .b8 _[]\n";
1649   O << ")";
1650   if (shouldEmitPTXNoReturn(&CB, *nvTM))
1651     O << " .noreturn";
1652   O << ";";
1653 
1654   return Prototype;
1655 }
1656 
1657 Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
1658                                                 const CallBase *CB, Type *Ty,
1659                                                 unsigned Idx,
1660                                                 const DataLayout &DL) const {
1661   if (!CB) {
1662     // CallSite is zero; fall back to the ABI type alignment
1663     return DL.getABITypeAlign(Ty);
1664   }
1665 
1666   unsigned Alignment = 0;
1667   const Function *DirectCallee = CB->getCalledFunction();
1668 
1669   if (!DirectCallee) {
1670     // We don't have a direct function symbol, but that may be because of
1671     // constant cast instructions in the call.
1672 
1673     // With bitcast'd call targets, the instruction will be the call
1674     if (const auto *CI = dyn_cast<CallInst>(CB)) {
1675       // Check if we have call alignment metadata
1676       if (getAlign(*CI, Idx, Alignment))
1677         return Align(Alignment);
1678     }
1679     DirectCallee = getMaybeBitcastedCallee(CB);
1680   }
1681 
1682   // Check for function alignment information if we found that the
1683   // ultimate target is a Function
1684   if (DirectCallee) {
1685     if (getAlign(*DirectCallee, Idx, Alignment))
1686       return Align(Alignment);
1687     // If alignment information is not available, fall back to the
1688     // default function param optimized type alignment
1689     return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL);
1690   }
1691 
1692   // Call is indirect, fall back to the ABI type alignment
1693   return DL.getABITypeAlign(Ty);
1694 }
1695 
1696 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1697                                        SmallVectorImpl<SDValue> &InVals) const {
1698 
1699   if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1700     report_fatal_error(
1701         "Support for variadic functions (unsized array parameter) was "
1702         "introduced in PTX ISA version 6.0 and requires target sm_30 or later.");
1703 
1704   SelectionDAG &DAG = CLI.DAG;
1705   SDLoc dl = CLI.DL;
1706   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1707   SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1708   SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1709   SDValue Chain = CLI.Chain;
1710   SDValue Callee = CLI.Callee;
1711   bool &isTailCall = CLI.IsTailCall;
1712   ArgListTy &Args = CLI.getArgs();
1713   Type *RetTy = CLI.RetTy;
1714   const CallBase *CB = CLI.CB;
1715   const DataLayout &DL = DAG.getDataLayout();
1716 
1717   bool isABI = (STI.getSmVersion() >= 20);
1718   assert(isABI && "Non-ABI compilation is not supported");
1719   if (!isABI)
1720     return Chain;
1721 
1722   // Variadic arguments.
1723   //
1724   // Normally, for each argument, we declare a param scalar or a param
1725   // byte array in the .param space, and store the argument value to that
1726   // param scalar or array starting at offset 0.
1727   //
1728   // In the case of the first variadic argument, we declare a vararg byte array
1729   // with size 0. The exact size of this array isn't known at this point, so
1730   // it'll be patched later. All the variadic arguments will be stored to this
1731   // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1732   // initially set to 0, so it can be used for non-variadic arguments (which use
1733   // 0 offset) to simplify the code.
1734   //
1735   // After all variadic arguments are processed, 'VAOffset' holds the size
1736   // of the vararg byte array.
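  //
  // For example (illustrative): a call passing two extra i32 values through
  // the variadic part produces a single byte-array declaration along the
  // lines of ".param .align <maxalign> .b8 param<n>[8]", with the first value
  // stored at offset 0 and the second at offset 4.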
1737 
1738   SDValue VADeclareParam;                 // vararg byte array
1739   unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic
1740   unsigned VAOffset = 0;                  // current offset in the param array
1741 
1742   unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
1743   SDValue TempChain = Chain;
1744   Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
1745   SDValue InGlue = Chain.getValue(1);
1746 
1747   unsigned ParamCount = 0;
1748   // Args.size() and Outs.size() need not match.
1749   // Outs.size() will be larger
1750   //   * if there is an aggregate argument with multiple fields (each field
1751   //     showing up separately in Outs)
1752   //   * if there is a vector argument with more than typical vector-length
1753   //     elements (generally if more than 4) where each vector element is
1754   //     individually present in Outs.
1755   // So a different index should be used for indexing into Outs/OutVals.
1756   // See similar issue in LowerFormalArguments.
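  // For instance, a single {i32, i32} aggregate argument is one entry in Args
  // but contributes two entries to Outs/OutVals, so those are indexed with
  // OIdx rather than i.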
1757   unsigned OIdx = 0;
1758   // Declare the .param or .reg spaces needed to pass values
1759   // to the function.
1760   for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1761     EVT VT = Outs[OIdx].VT;
1762     Type *Ty = Args[i].Ty;
1763     bool IsVAArg = (i >= CLI.NumFixedArgs);
1764     bool IsByVal = Outs[OIdx].Flags.isByVal();
1765 
1766     SmallVector<EVT, 16> VTs;
1767     SmallVector<uint64_t, 16> Offsets;
1768 
1769     assert((!IsByVal || Args[i].IndirectType) &&
1770            "byval arg must have indirect type");
1771     Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
1772     ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset);
1773 
1774     Align ArgAlign;
1775     if (IsByVal) {
1776       // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1777       // so we don't need to worry whether it's naturally aligned or not.
1778       // See TargetLowering::LowerCallTo().
1779       Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1780       ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
1781                                             InitialAlign, DL);
1782       if (IsVAArg)
1783         VAOffset = alignTo(VAOffset, ArgAlign);
1784     } else {
1785       ArgAlign = getArgumentAlignment(Callee, CB, Ty, ParamCount + 1, DL);
1786     }
1787 
1788     unsigned TypeSize =
1789         (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty));
1790     SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1791 
1792     bool NeedAlign; // Does argument declaration specify alignment?
1793     bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty);
1794     if (IsVAArg) {
1795       if (ParamCount == FirstVAArg) {
1796         SDValue DeclareParamOps[] = {
1797             Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32),
1798             DAG.getConstant(ParamCount, dl, MVT::i32),
1799             DAG.getConstant(1, dl, MVT::i32), InGlue};
1800         VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl,
1801                                              DeclareParamVTs, DeclareParamOps);
1802       }
1803       NeedAlign = PassAsArray;
1804     } else if (PassAsArray) {
1805       // declare .param .align <align> .b8 .param<n>[<size>];
1806       SDValue DeclareParamOps[] = {
1807           Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
1808           DAG.getConstant(ParamCount, dl, MVT::i32),
1809           DAG.getConstant(TypeSize, dl, MVT::i32), InGlue};
1810       Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1811                           DeclareParamOps);
1812       NeedAlign = true;
1813     } else {
1814       // declare .param .b<size> .param<n>;
1815       if (VT.isInteger() || VT.isFloatingPoint()) {
1816         // PTX ABI requires integral types to be at least 32 bits in
1817         // size. FP16 is loaded/stored using i16, so it's handled
1818         // here as well.
1819         TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8;
1820       }
1821       SDValue DeclareScalarParamOps[] = {
1822           Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
1823           DAG.getConstant(TypeSize * 8, dl, MVT::i32),
1824           DAG.getConstant(0, dl, MVT::i32), InGlue};
1825       Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1826                           DeclareScalarParamOps);
1827       NeedAlign = false;
1828     }
1829     InGlue = Chain.getValue(1);
1830 
1831     // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1832     // than 32 bits are sign extended or zero extended, depending on
1833     // whether they are signed or unsigned types. This case applies
1834     // only to scalar parameters and not to aggregate values.
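    // For example, a scalar i8 argument is declared as ".param .b32" (via the
    // size promotion above) and its value is sign- or zero-extended to 32 bits
    // before being stored with StoreParam below.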
1835     bool ExtendIntegerParam =
1836         Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1837 
1838     auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1839     SmallVector<SDValue, 6> StoreOperands;
1840     for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1841       EVT EltVT = VTs[j];
1842       int CurOffset = Offsets[j];
1843       MaybeAlign PartAlign;
1844       if (NeedAlign)
1845         PartAlign = commonAlignment(ArgAlign, CurOffset);
1846 
1847       // New store.
1848       if (VectorInfo[j] & PVF_FIRST) {
1849         assert(StoreOperands.empty() && "Unfinished preceding store.");
1850         StoreOperands.push_back(Chain);
1851         StoreOperands.push_back(
1852             DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
1853         StoreOperands.push_back(DAG.getConstant(
1854             IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
1855             dl, MVT::i32));
1856       }
1857 
1858       SDValue StVal = OutVals[OIdx];
1859 
1860       MVT PromotedVT;
1861       if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
1862         EltVT = EVT(PromotedVT);
1863       }
1864       if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {
1865         llvm::ISD::NodeType Ext =
1866             Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1867         StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
1868       }
1869 
1870       if (IsByVal) {
1871         auto PtrVT = getPointerTy(DL);
1872         SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
1873                                       DAG.getConstant(CurOffset, dl, PtrVT));
1874         StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(),
1875                             PartAlign);
1876       } else if (ExtendIntegerParam) {
1877         assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1878         // zext/sext to i32
1879         StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1880                                                       : ISD::ZERO_EXTEND,
1881                             dl, MVT::i32, StVal);
1882       }
1883 
1884       if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {
1885         // Use 16-bit registers for small stores as it's the
1886         // smallest general purpose register size supported by NVPTX.
1887         StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1888       }
1889 
1890       // Record the value to store.
1891       StoreOperands.push_back(StVal);
1892 
1893       if (VectorInfo[j] & PVF_LAST) {
1894         unsigned NumElts = StoreOperands.size() - 3;
1895         NVPTXISD::NodeType Op;
1896         switch (NumElts) {
1897         case 1:
1898           Op = NVPTXISD::StoreParam;
1899           break;
1900         case 2:
1901           Op = NVPTXISD::StoreParamV2;
1902           break;
1903         case 4:
1904           Op = NVPTXISD::StoreParamV4;
1905           break;
1906         default:
1907           llvm_unreachable("Invalid vector info.");
1908         }
1909 
1910         StoreOperands.push_back(InGlue);
1911 
1912         // Adjust type of the store op if we've extended the scalar
1913         // return value.
1914         EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1915 
1916         Chain = DAG.getMemIntrinsicNode(
1917             Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1918             TheStoreType, MachinePointerInfo(), PartAlign,
1919             MachineMemOperand::MOStore);
1920         InGlue = Chain.getValue(1);
1921 
1922         // Cleanup.
1923         StoreOperands.clear();
1924 
1925         // TODO: We may need to support vector types that can be passed
1926         // as scalars in variadic arguments.
1927         if (!IsByVal && IsVAArg) {
1928           assert(NumElts == 1 &&
1929                  "Vectorization is expected to be disabled for variadics.");
1930           VAOffset += DL.getTypeAllocSize(
1931               TheStoreType.getTypeForEVT(*DAG.getContext()));
1932         }
1933       }
1934       if (!IsByVal)
1935         ++OIdx;
1936     }
1937     assert(StoreOperands.empty() && "Unfinished parameter store.");
1938     if (!IsByVal && VTs.size() > 0)
1939       --OIdx;
1940     ++ParamCount;
1941     if (IsByVal && IsVAArg)
1942       VAOffset += TypeSize;
1943   }
1944 
1945   GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1946   MaybeAlign retAlignment = std::nullopt;
1947 
1948   // Handle Result
1949   if (Ins.size() > 0) {
1950     SmallVector<EVT, 16> resvtparts;
1951     ComputeValueVTs(*this, DL, RetTy, resvtparts);
1952 
1953     // Declare
1954     //  .param .align N .b8 retval0[<size-in-bytes>], or
1955     //  .param .b<size-in-bits> retval0
1956     unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1957     if (!IsTypePassedAsArray(RetTy)) {
1958       resultsz = promoteScalarArgumentSize(resultsz);
1959       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1960       SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1961                                   DAG.getConstant(resultsz, dl, MVT::i32),
1962                                   DAG.getConstant(0, dl, MVT::i32), InGlue };
1963       Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1964                           DeclareRetOps);
1965       InGlue = Chain.getValue(1);
1966     } else {
1967       retAlignment = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
1968       assert(retAlignment && "retAlignment is guaranteed to be set");
1969       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1970       SDValue DeclareRetOps[] = {
1971           Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
1972           DAG.getConstant(resultsz / 8, dl, MVT::i32),
1973           DAG.getConstant(0, dl, MVT::i32), InGlue};
1974       Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1975                           DeclareRetOps);
1976       InGlue = Chain.getValue(1);
1977     }
1978   }
1979 
1980   bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1981   // Set the size of the vararg param byte array if the callee is a variadic
1982   // function and the variadic part is not empty.
1983   if (HasVAArgs) {
1984     SDValue DeclareParamOps[] = {
1985         VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),
1986         VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),
1987         VADeclareParam.getOperand(4)};
1988     DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1989                     VADeclareParam->getVTList(), DeclareParamOps);
1990   }
1991 
1992   // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1993   // between them we must rely on the call site value which is valid for
1994   // indirect calls but is always null for libcalls.
1995   bool isIndirectCall = !Func && CB;
1996 
1997   if (isa<ExternalSymbolSDNode>(Callee)) {
1998     Function* CalleeFunc = nullptr;
1999 
2000     // Try to find the callee in the current module.
2001     Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
2002     assert(CalleeFunc != nullptr && "Libcall callee must be set.");
2003 
2004     // Set the "libcall callee" attribute to indicate that the function
2005     // must always have a declaration.
2006     CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
2007   }
2008 
2009   if (isIndirectCall) {
2010     // This is the indirect function call case: PTX requires a prototype of
2011     // the form
2012     // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
2013     // to be emitted, and the label has to be used as the last arg of the
2014     // call instruction.
2015     // The prototype is embedded in a string and used as the operand of a
2016     // CallPrototype SDNode, which prints out as the value of the string.
2017     SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2018     std::string Proto = getPrototype(
2019         DL, RetTy, Args, Outs, retAlignment,
2020         HasVAArgs
2021             ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
2022                   CLI.NumFixedArgs,
2023                   cast<ConstantSDNode>(VADeclareParam->getOperand(1))
2024                       ->getAPIntValue()))
2025             : std::nullopt,
2026         *CB, UniqueCallSite);
2027     const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
2028     SDValue ProtoOps[] = {
2029         Chain,
2030         DAG.getTargetExternalSymbol(ProtoStr, MVT::i32),
2031         InGlue,
2032     };
2033     Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
2034     InGlue = Chain.getValue(1);
2035   }
2036   // Op to just print "call"
2037   SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2038   SDValue PrintCallOps[] = {
2039     Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
2040   };
2041   // We model convergent calls as separate opcodes.
2042   unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
2043   if (CLI.IsConvergent)
2044     Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
2045                                               : NVPTXISD::PrintConvergentCall;
2046   Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
2047   InGlue = Chain.getValue(1);
2048 
2049   // Ops to print out the function name
2050   SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2051   SDValue CallVoidOps[] = { Chain, Callee, InGlue };
2052   Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
2053   InGlue = Chain.getValue(1);
2054 
2055   // Ops to print out the param list
2056   SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2057   SDValue CallArgBeginOps[] = { Chain, InGlue };
2058   Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
2059                       CallArgBeginOps);
2060   InGlue = Chain.getValue(1);
2061 
2062   for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e;
2063        ++i) {
2064     unsigned opcode;
2065     if (i == (e - 1))
2066       opcode = NVPTXISD::LastCallArg;
2067     else
2068       opcode = NVPTXISD::CallArg;
2069     SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2070     SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
2071                              DAG.getConstant(i, dl, MVT::i32), InGlue };
2072     Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
2073     InGlue = Chain.getValue(1);
2074   }
2075   SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2076   SDValue CallArgEndOps[] = { Chain,
2077                               DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
2078                               InGlue };
2079   Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
2080   InGlue = Chain.getValue(1);
2081 
2082   if (isIndirectCall) {
2083     SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2084     SDValue PrototypeOps[] = {
2085         Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
2086     Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
2087     InGlue = Chain.getValue(1);
2088   }
2089 
2090   SmallVector<SDValue, 16> ProxyRegOps;
2091   SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
2092 
2093   // Generate loads from param memory/moves from registers for result
2094   if (Ins.size() > 0) {
2095     SmallVector<EVT, 16> VTs;
2096     SmallVector<uint64_t, 16> Offsets;
2097     ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
2098     assert(VTs.size() == Ins.size() && "Bad value decomposition");
2099 
2100     Align RetAlign = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
2101     auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
2102 
2103     SmallVector<EVT, 6> LoadVTs;
2104     int VecIdx = -1; // Index of the first element of the vector.
2105 
2106     // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2107     // 32 bits are sign extended or zero extended, depending on whether
2108     // they are signed or unsigned types.
2109     bool ExtendIntegerRetVal =
2110         RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2111 
2112     for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2113       bool needTruncate = false;
2114       EVT TheLoadType = VTs[i];
2115       EVT EltType = Ins[i].VT;
2116       Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
2117       MVT PromotedVT;
2118 
2119       if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
2120         TheLoadType = EVT(PromotedVT);
2121         EltType = EVT(PromotedVT);
2122         needTruncate = true;
2123       }
2124 
2125       if (ExtendIntegerRetVal) {
2126         TheLoadType = MVT::i32;
2127         EltType = MVT::i32;
2128         needTruncate = true;
2129       } else if (TheLoadType.getSizeInBits() < 16) {
2130         if (VTs[i].isInteger())
2131           needTruncate = true;
2132         EltType = MVT::i16;
2133       }
2134 
2135       // Record index of the very first element of the vector.
2136       if (VectorInfo[i] & PVF_FIRST) {
2137         assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
2138         VecIdx = i;
2139       }
2140 
2141       LoadVTs.push_back(EltType);
2142 
2143       if (VectorInfo[i] & PVF_LAST) {
2144         unsigned NumElts = LoadVTs.size();
2145         LoadVTs.push_back(MVT::Other);
2146         LoadVTs.push_back(MVT::Glue);
2147         NVPTXISD::NodeType Op;
2148         switch (NumElts) {
2149         case 1:
2150           Op = NVPTXISD::LoadParam;
2151           break;
2152         case 2:
2153           Op = NVPTXISD::LoadParamV2;
2154           break;
2155         case 4:
2156           Op = NVPTXISD::LoadParamV4;
2157           break;
2158         default:
2159           llvm_unreachable("Invalid vector info.");
2160         }
2161 
2162         SDValue LoadOperands[] = {
2163             Chain, DAG.getConstant(1, dl, MVT::i32),
2164             DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue};
2165         SDValue RetVal = DAG.getMemIntrinsicNode(
2166             Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
2167             MachinePointerInfo(), EltAlign,
2168             MachineMemOperand::MOLoad);
2169 
2170         for (unsigned j = 0; j < NumElts; ++j) {
2171           ProxyRegOps.push_back(RetVal.getValue(j));
2172 
2173           if (needTruncate)
2174             ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT));
2175           else
2176             ProxyRegTruncates.push_back(std::optional<MVT>());
2177         }
2178 
2179         Chain = RetVal.getValue(NumElts);
2180         InGlue = RetVal.getValue(NumElts + 1);
2181 
2182         // Cleanup
2183         VecIdx = -1;
2184         LoadVTs.clear();
2185       }
2186     }
2187   }
2188 
2189   Chain =
2190       DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
2191   InGlue = Chain.getValue(1);
2192 
2193   // Append ProxyReg instructions to the chain to make sure that `callseq_end`
2194   // will not get lost. Otherwise, during libcall expansion, the nodes can
2195   // become dangling.
2196   for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
2197     SDValue Ret = DAG.getNode(
2198       NVPTXISD::ProxyReg, dl,
2199       DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
2200       { Chain, ProxyRegOps[i], InGlue }
2201     );
2202 
2203     Chain = Ret.getValue(1);
2204     InGlue = Ret.getValue(2);
2205 
2206     if (ProxyRegTruncates[i]) {
2207       Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret);
2208     }
2209 
2210     InVals.push_back(Ret);
2211   }
2212 
2213   // set isTailCall to false for now, until we figure out how to express
2214   // tail call optimization in PTX
2215   isTailCall = false;
2216   return Chain;
2217 }
2218 
2219 SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
2220                                                      SelectionDAG &DAG) const {
2221   const Function &Fn = DAG.getMachineFunction().getFunction();
2222 
2223   DiagnosticInfoUnsupported NoDynamicAlloca(
2224       Fn, "dynamic alloca unsupported by NVPTX backend",
2225       SDLoc(Op).getDebugLoc());
2226   DAG.getContext()->diagnose(NoDynamicAlloca);
2227   auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
2228   return DAG.getMergeValues(Ops, SDLoc());
2229 }
2230 
2231 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
2232 // (see LegalizeDAG.cpp). This is slow and uses local memory.
2233 // We use extract/insert/build-vector instead, just as LegalizeOp() did in LLVM 2.5.
2234 SDValue
2235 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
2236   SDNode *Node = Op.getNode();
2237   SDLoc dl(Node);
2238   SmallVector<SDValue, 8> Ops;
2239   unsigned NumOperands = Node->getNumOperands();
2240   for (unsigned i = 0; i < NumOperands; ++i) {
2241     SDValue SubOp = Node->getOperand(i);
2242     EVT VVT = SubOp.getNode()->getValueType(0);
2243     EVT EltVT = VVT.getVectorElementType();
2244     unsigned NumSubElem = VVT.getVectorNumElements();
2245     for (unsigned j = 0; j < NumSubElem; ++j) {
2246       Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
2247                                 DAG.getIntPtrConstant(j, dl)));
2248     }
2249   }
2250   return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
2251 }
2252 
2253 // We can initialize a constant f16x2/v2i16/v4i8 with a single .b32 move.
2254 // Normally it would get lowered as two constant loads and a vector-packing
2255 // move. Instead we want just a constant move:
2256 //        mov.b32         %r2, 0x40003C00
2257 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2258                                                SelectionDAG &DAG) const {
2259   EVT VT = Op->getValueType(0);
2260   if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
2261     return Op;
2262 
2263   SDLoc DL(Op);
2264 
2265   if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2266         return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2267                isa<ConstantFPSDNode>(Operand);
2268       })) {
2269     // Lower a non-constant v4i8 vector as a byte-wise constructed i32, which
2270     // allows us to optimize the calculation of its constant parts.
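    // NVPTXISD::BFI corresponds to the PTX bfi.b32 instruction: it inserts the
    // low 'len' bits of its first operand into the second operand at the given
    // bit position, which lets us assemble the packed i32 one byte at a time.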
2271     if (VT == MVT::v4i8) {
2272       SDValue C8 = DAG.getConstant(8, DL, MVT::i32);
2273       SDValue E01 = DAG.getNode(
2274           NVPTXISD::BFI, DL, MVT::i32,
2275           DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
2276           DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
2277       SDValue E012 =
2278           DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2279                       DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
2280                       E01, DAG.getConstant(16, DL, MVT::i32), C8);
2281       SDValue E0123 =
2282           DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2283                       DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
2284                       E012, DAG.getConstant(24, DL, MVT::i32), C8);
2285       return DAG.getNode(ISD::BITCAST, DL, VT, E0123);
2286     }
2287     return Op;
2288   }
2289 
2290   // Get value or the Nth operand as an APInt(32). Undef values treated as 0.
2291   auto GetOperand = [](SDValue Op, int N) -> APInt {
2292     const SDValue &Operand = Op->getOperand(N);
2293     EVT VT = Op->getValueType(0);
2294     if (Operand->isUndef())
2295       return APInt(32, 0);
2296     APInt Value;
2297     if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2298       Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2299     else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2300       Value = cast<ConstantSDNode>(Operand)->getAPIntValue();
2301     else
2302       llvm_unreachable("Unsupported type");
2303     // i8 values are carried around as i16, so we need to zero out the upper
2304     // bits so they do not get in the way of combining individual byte values.
2305     if (VT == MVT::v4i8)
2306       Value = Value.trunc(8);
2307     return Value.zext(32);
2308   };
2309   APInt Value;
2310   if (Isv2x16VT(VT)) {
2311     Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16);
2312   } else if (VT == MVT::v4i8) {
2313     Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) |
2314             GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24);
2315   } else {
2316     llvm_unreachable("Unsupported type");
2317   }
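  // Illustrative example: a constant v4i8 <0x11, 0x22, 0x33, 0x44> packs into
  // Value = 0x44332211, i.e. element 0 ends up in the least significant byte
  // of the emitted .b32 immediate.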
2318   SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32);
2319   return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const);
2320 }
2321 
2322 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2323                                                      SelectionDAG &DAG) const {
2324   SDValue Index = Op->getOperand(1);
2325   SDValue Vector = Op->getOperand(0);
2326   SDLoc DL(Op);
2327   EVT VectorVT = Vector.getValueType();
2328 
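  // For v4i8, extract the requested byte with a bit-field extract
  // (NVPTXISD::BFE, i.e. PTX bfe): the start position is Index * 8 and the
  // field width is 8 bits; the result is then extended or truncated to the
  // requested element type.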
2329   if (VectorVT == MVT::v4i8) {
2330     SDValue BFE =
2331         DAG.getNode(NVPTXISD::BFE, DL, MVT::i32,
2332                     {Vector,
2333                      DAG.getNode(ISD::MUL, DL, MVT::i32,
2334                                  DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2335                                  DAG.getConstant(8, DL, MVT::i32)),
2336                      DAG.getConstant(8, DL, MVT::i32)});
2337     return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0));
2338   }
2339 
2340   // Constant index will be matched by tablegen.
2341   if (isa<ConstantSDNode>(Index.getNode()))
2342     return Op;
2343 
2344   // Extract individual elements and select one of them.
2345   assert(Isv2x16VT(VectorVT) && "Unexpected vector type.");
2346   EVT EltVT = VectorVT.getVectorElementType();
2347 
2348   SDLoc dl(Op.getNode());
2349   SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2350                            DAG.getIntPtrConstant(0, dl));
2351   SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2352                            DAG.getIntPtrConstant(1, dl));
2353   return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2354                          ISD::CondCode::SETEQ);
2355 }
2356 
2357 SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2358                                                     SelectionDAG &DAG) const {
2359   SDValue Vector = Op->getOperand(0);
2360   EVT VectorVT = Vector.getValueType();
2361 
2362   if (VectorVT != MVT::v4i8)
2363     return Op;
2364   SDLoc DL(Op);
2365   SDValue Value = Op->getOperand(1);
2366   if (Value->isUndef())
2367     return Vector;
2368 
2369   SDValue Index = Op->getOperand(2);
2370 
2371   SDValue BFI =
2372       DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2373                   {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2374                    DAG.getNode(ISD::MUL, DL, MVT::i32,
2375                                DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2376                                DAG.getConstant(8, DL, MVT::i32)),
2377                    DAG.getConstant(8, DL, MVT::i32)});
2378   return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2379 }
2380 
2381 SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2382                                                  SelectionDAG &DAG) const {
2383   SDValue V1 = Op.getOperand(0);
2384   EVT VectorVT = V1.getValueType();
2385   if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2386     return Op;
2387 
2388   // Lower shuffle to PRMT instruction.
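  // PRMT (PTX prmt.b32) selects four bytes out of the eight source bytes
  // (bytes 0-3 from V1, bytes 4-7 from V2); each 4-bit nibble of the selector
  // picks one source byte, so the shuffle mask indices map directly onto the
  // selector value built below.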
2389   const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2390   SDValue V2 = Op.getOperand(1);
2391   uint32_t Selector = 0;
2392   for (auto I : llvm::enumerate(SVN->getMask()))
2393     Selector |= (I.value() << (I.index() * 4));
2394 
2395   SDLoc DL(Op);
2396   return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2,
2397                      DAG.getConstant(Selector, DL, MVT::i32),
2398                      DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32));
2399 }
2400 /// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which either
2401 /// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
2402 ///    amount, or
2403 /// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
2404 ///    amount.
2405 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2406                                                   SelectionDAG &DAG) const {
2407   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2408   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2409 
2410   EVT VT = Op.getValueType();
2411   unsigned VTBits = VT.getSizeInBits();
2412   SDLoc dl(Op);
2413   SDValue ShOpLo = Op.getOperand(0);
2414   SDValue ShOpHi = Op.getOperand(1);
2415   SDValue ShAmt  = Op.getOperand(2);
2416   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2417 
2418   if (VTBits == 32 && STI.getSmVersion() >= 35) {
2419     // For 32-bit shifts on sm_35+, we can use the funnel shift 'shf' instruction.
2420     // {dHi, dLo} = {aHi, aLo} >> Amt
2421     //   dHi = aHi >> Amt
2422     //   dLo = shf.r.clamp aLo, aHi, Amt
2423 
2424     SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2425     SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
2426                              ShAmt);
2427 
2428     SDValue Ops[2] = { Lo, Hi };
2429     return DAG.getMergeValues(Ops, dl);
2430   }
2431   else {
2432     // {dHi, dLo} = {aHi, aLo} >> Amt
2433     // - if (Amt>=size) then
2434     //      dLo = aHi >> (Amt-size)
2435     //      dHi = aHi >> Amt (this is either all 0 or all 1)
2436     //   else
2437     //      dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2438     //      dHi = aHi >> Amt
2439 
2440     SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2441                                    DAG.getConstant(VTBits, dl, MVT::i32),
2442                                    ShAmt);
2443     SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2444     SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2445                                      DAG.getConstant(VTBits, dl, MVT::i32));
2446     SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2447     SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2448     SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2449 
2450     SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2451                                DAG.getConstant(VTBits, dl, MVT::i32),
2452                                ISD::SETGE);
2453     SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2454     SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2455 
2456     SDValue Ops[2] = { Lo, Hi };
2457     return DAG.getMergeValues(Ops, dl);
2458   }
2459 }
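// For illustration, take a 64-bit logical shift right split into i32 halves,
// with {aHi, aLo} = {0x00000012, 0x34567890} and Amt = 8 (the sm_35 path above):
//   dLo = shf.r.clamp(aLo, aHi, 8) = (aLo >> 8) | (aHi << 24) = 0x12345678
//   dHi = aHi >> 8                                            = 0x00000000
// which matches 0x0000001234567890 >> 8 = 0x0000000012345678.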
2460 
2461 /// LowerShiftLeftParts - Lower SHL_PARTS, which either
2462 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2463 ///    amount, or
2464 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2465 ///    amount.
2466 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2467                                                  SelectionDAG &DAG) const {
2468   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2469   assert(Op.getOpcode() == ISD::SHL_PARTS);
2470 
2471   EVT VT = Op.getValueType();
2472   unsigned VTBits = VT.getSizeInBits();
2473   SDLoc dl(Op);
2474   SDValue ShOpLo = Op.getOperand(0);
2475   SDValue ShOpHi = Op.getOperand(1);
2476   SDValue ShAmt  = Op.getOperand(2);
2477 
2478   if (VTBits == 32 && STI.getSmVersion() >= 35) {
2479     // For 32-bit and sm_35+, we can use the funnel shift 'shf' instruction.
2480     // {dHi, dLo} = {aHi, aLo} << Amt
2481     //   dHi = shf.l.clamp aLo, aHi, Amt
2482     //   dLo = aLo << Amt
2483 
2484     SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2485                              ShAmt);
2486     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2487 
2488     SDValue Ops[2] = { Lo, Hi };
2489     return DAG.getMergeValues(Ops, dl);
2490   }
2491   else {
2492     // {dHi, dLo} = {aHi, aLo} << Amt
2493     // - if (Amt>=size) then
2494     //      dLo = aLo << Amt (all 0)
2495     //      dHi = aLo << (Amt-size)
2496     //   else
2497     //      dLo = aLo << Amt
2498     //      dHi = (aHi << Amt) | (aLo >> (size-Amt))
2499 
2500     SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2501                                    DAG.getConstant(VTBits, dl, MVT::i32),
2502                                    ShAmt);
2503     SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2504     SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2505                                      DAG.getConstant(VTBits, dl, MVT::i32));
2506     SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2507     SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2508     SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2509 
2510     SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2511                                DAG.getConstant(VTBits, dl, MVT::i32),
2512                                ISD::SETGE);
2513     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2514     SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2515 
2516     SDValue Ops[2] = { Lo, Hi };
2517     return DAG.getMergeValues(Ops, dl);
2518   }
2519 }
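// For illustration, the same pair {aHi, aLo} = {0x00000012, 0x34567890}
// shifted left by 8 on the sm_35 path above:
//   dHi = shf.l.clamp(aLo, aHi, 8) = (aHi << 8) | (aLo >> 24) = 0x00001234
//   dLo = aLo << 8                                            = 0x56789000
// which matches 0x0000001234567890 << 8 = 0x0000123456789000.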
2520 
2521 SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2522   EVT VT = Op.getValueType();
2523 
2524   if (VT == MVT::f32)
2525     return LowerFROUND32(Op, DAG);
2526 
2527   if (VT == MVT::f64)
2528     return LowerFROUND64(Op, DAG);
2529 
2530   llvm_unreachable("unhandled type");
2531 }
2532 
2533 // This is the rounding method used in CUDA libdevice, in C-like code:
2534 // float roundf(float A)
2535 // {
2536 //   float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2537 //   RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2538 //   return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2539 // }
2540 SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2541                                            SelectionDAG &DAG) const {
2542   SDLoc SL(Op);
2543   SDValue A = Op.getOperand(0);
2544   EVT VT = Op.getValueType();
2545 
2546   SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2547 
2548   // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2549   SDValue Bitcast  = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2550   const int SignBitMask = 0x80000000;
2551   SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2552                              DAG.getConstant(SignBitMask, SL, MVT::i32));
2553   const int PointFiveInBits = 0x3F000000;
2554   SDValue PointFiveWithSignRaw =
2555       DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2556                   DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2557   SDValue PointFiveWithSign =
2558       DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2559   SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2560   SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2561 
2562   // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2563   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2564   SDValue IsLarge =
2565       DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2566                    ISD::SETOGT);
2567   RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2568 
2569   // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2570   SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2571                                  DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2572   SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2573   return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2574 }
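// A few values worked through the sequence above, for illustration:
//   A =  2.5f : AdjustedA =  3.0f, FTRUNC ->  3.0f; not small, not large =>  3.0f
//   A = -2.5f : AdjustedA = -3.0f, FTRUNC -> -3.0f; not small, not large => -3.0f
//   A =  0.3f : |A| < 0.5, so the result is FTRUNC(0.3f) = 0.0f
// i.e. halfway cases round away from zero, matching libdevice's roundf.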
2575 
2576 // The implementation of round(double) is similar to that of round(float) in
2577 // that they both separate the value range into three regions and use a method
2578 // specific to the region to round the values. However, round(double) first
2579 // calculates the round of the absolute value and then adds the sign back while
2580 // round(float) directly rounds the value with sign.
2581 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2582                                            SelectionDAG &DAG) const {
2583   SDLoc SL(Op);
2584   SDValue A = Op.getOperand(0);
2585   EVT VT = Op.getValueType();
2586 
2587   SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2588 
2589   // double RoundedA = (double) (int) (abs(A) + 0.5f);
2590   SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2591                                   DAG.getConstantFP(0.5, SL, VT));
2592   SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2593 
2594   // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2595   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2596   SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2597                                  DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2598   RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2599                          DAG.getConstantFP(0, SL, VT),
2600                          RoundedA);
2601 
2602   // Add the sign of A back to RoundedA.
2603   RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2605 
2606   // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2607   SDValue IsLarge =
2608       DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2609                    ISD::SETOGT);
2610   return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2611 }
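// Illustrative values for the double path above:
//   A =  2.5 : |A| + 0.5 = 3.0, FTRUNC -> 3.0; FCOPYSIGN(3.0, A) =  3.0
//   A = -0.3 : |A| < 0.5 selects 0.0, and FCOPYSIGN(0.0, A) restores the
//              sign, giving -0.0 as expected for round(-0.3).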
2612 
2613 SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2614                                             SelectionDAG &DAG) const {
2615   assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2616 
2617   if (Op.getValueType() == MVT::bf16) {
2618     SDLoc Loc(Op);
2619     return DAG.getNode(
2620         ISD::FP_ROUND, Loc, MVT::bf16,
2621         DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2622         DAG.getIntPtrConstant(0, Loc));
2623   }
2624 
2625   // Everything else is considered legal.
2626   return Op;
2627 }
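// For illustration: under the assert above (i.e. not sm_90 with PTX 7.8,
// where the direct conversions are expected to be legal), something like
//   %f = sitofp i32 %x to bfloat
// is emitted as SINT_TO_FP to f32 followed by an FP_ROUND to bf16.
// LowerFP_TO_INT below applies the mirror-image trick, extending bf16 to f32
// before converting to the integer type.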
2628 
2629 SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2630                                             SelectionDAG &DAG) const {
2631   assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2632 
2633   if (Op.getOperand(0).getValueType() == MVT::bf16) {
2634     SDLoc Loc(Op);
2635     return DAG.getNode(
2636         Op.getOpcode(), Loc, Op.getValueType(),
2637         DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2638   }
2639 
2640   // Everything else is considered legal.
2641   return Op;
2642 }
2643 
2644 static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
2645   SDLoc DL(Op);
2646   if (Op.getValueType() != MVT::v2i16)
2647     return Op;
2648   EVT EltVT = Op.getValueType().getVectorElementType();
2649   SmallVector<SDValue> VecElements;
2650   for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2651     SmallVector<SDValue> ScalarArgs;
2652     llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2653                     [&](const SDUse &O) {
2654                       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2655                                          O.get(), DAG.getIntPtrConstant(I, DL));
2656                     });
2657     VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2658   }
2659   SDValue V =
2660       DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2661   return V;
2662 }
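// For illustration, an operation such as
//   %r = add <2 x i16> %a, %b
// is scalarized by the helper above into per-lane i16 adds and recombined:
//   BUILD_VECTOR(ADD(extract(%a, 0), extract(%b, 0)),
//                ADD(extract(%a, 1), extract(%b, 1)))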
2663 
2664 SDValue
2665 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2666   switch (Op.getOpcode()) {
2667   case ISD::RETURNADDR:
2668     return SDValue();
2669   case ISD::FRAMEADDR:
2670     return SDValue();
2671   case ISD::GlobalAddress:
2672     return LowerGlobalAddress(Op, DAG);
2673   case ISD::INTRINSIC_W_CHAIN:
2674     return Op;
2675   case ISD::BUILD_VECTOR:
2676     return LowerBUILD_VECTOR(Op, DAG);
2677   case ISD::EXTRACT_SUBVECTOR:
2678     return Op;
2679   case ISD::EXTRACT_VECTOR_ELT:
2680     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2681   case ISD::INSERT_VECTOR_ELT:
2682     return LowerINSERT_VECTOR_ELT(Op, DAG);
2683   case ISD::VECTOR_SHUFFLE:
2684     return LowerVECTOR_SHUFFLE(Op, DAG);
2685   case ISD::CONCAT_VECTORS:
2686     return LowerCONCAT_VECTORS(Op, DAG);
2687   case ISD::STORE:
2688     return LowerSTORE(Op, DAG);
2689   case ISD::LOAD:
2690     return LowerLOAD(Op, DAG);
2691   case ISD::SHL_PARTS:
2692     return LowerShiftLeftParts(Op, DAG);
2693   case ISD::SRA_PARTS:
2694   case ISD::SRL_PARTS:
2695     return LowerShiftRightParts(Op, DAG);
2696   case ISD::SELECT:
2697     return LowerSelect(Op, DAG);
2698   case ISD::FROUND:
2699     return LowerFROUND(Op, DAG);
2700   case ISD::SINT_TO_FP:
2701   case ISD::UINT_TO_FP:
2702     return LowerINT_TO_FP(Op, DAG);
2703   case ISD::FP_TO_SINT:
2704   case ISD::FP_TO_UINT:
2705     return LowerFP_TO_INT(Op, DAG);
2706   case ISD::VAARG:
2707     return LowerVAARG(Op, DAG);
2708   case ISD::VASTART:
2709     return LowerVASTART(Op, DAG);
2710   case ISD::ABS:
2711   case ISD::SMIN:
2712   case ISD::SMAX:
2713   case ISD::UMIN:
2714   case ISD::UMAX:
2715   case ISD::ADD:
2716   case ISD::SUB:
2717   case ISD::MUL:
2718   case ISD::SHL:
2719   case ISD::SREM:
2720   case ISD::UREM:
2721     return LowerVectorArith(Op, DAG);
2722   case ISD::DYNAMIC_STACKALLOC:
2723     return LowerDYNAMIC_STACKALLOC(Op, DAG);
2724   default:
2725     llvm_unreachable("Custom lowering not defined for operation");
2726   }
2727 }
2728 
2729 // This function is almost a copy of SelectionDAG::expandVAArg().
2730 // The only difference is that this one produces loads from the local address space.
2731 SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
2732   const TargetLowering *TLI = STI.getTargetLowering();
2733   SDLoc DL(Op);
2734 
2735   SDNode *Node = Op.getNode();
2736   const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
2737   EVT VT = Node->getValueType(0);
2738   auto *Ty = VT.getTypeForEVT(*DAG.getContext());
2739   SDValue Tmp1 = Node->getOperand(0);
2740   SDValue Tmp2 = Node->getOperand(1);
2741   const MaybeAlign MA(Node->getConstantOperandVal(3));
2742 
2743   SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
2744                                    Tmp1, Tmp2, MachinePointerInfo(V));
2745   SDValue VAList = VAListLoad;
2746 
2747   if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
2748     VAList = DAG.getNode(
2749         ISD::ADD, DL, VAList.getValueType(), VAList,
2750         DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
2751 
2752     VAList = DAG.getNode(
2753         ISD::AND, DL, VAList.getValueType(), VAList,
2754         DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType()));
2755   }
2756 
2757   // Increment the pointer, VAList, to the next vaarg
2758   Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
2759                      DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty),
2760                                      DL, VAList.getValueType()));
2761 
2762   // Store the incremented VAList to the legalized pointer
2763   Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
2764                       MachinePointerInfo(V));
2765 
2766   const Value *SrcV =
2767       Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL));
2768 
2769   // Load the actual argument out of the pointer VAList
2770   return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
2771 }
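// Roughly, the node sequence above corresponds to this C-like sketch (with
// the final load going to the local address space, as noted above; align_up
// is shorthand for the add/and pair):
//   char *p = *ap;                    // load the current va_list cursor
//   if (align > min_stack_arg_align)
//     p = align_up(p, align);         // over-align the cursor if requested
//   *ap = p + sizeof(T);              // store back the bumped cursor
//   return *(T *)p;                   // load the argument itself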
2772 
2773 SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
2774   const TargetLowering *TLI = STI.getTargetLowering();
2775   SDLoc DL(Op);
2776   EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
2777 
2778   // Store the address of unsized array <function>_vararg[] in the ap object.
2779   SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
2780   SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg);
2781 
2782   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2783   return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
2784                       MachinePointerInfo(SV));
2785 }
2786 
2787 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2788   SDValue Op0 = Op->getOperand(0);
2789   SDValue Op1 = Op->getOperand(1);
2790   SDValue Op2 = Op->getOperand(2);
2791   SDLoc DL(Op.getNode());
2792 
2793   assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2794 
2795   Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2796   Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2797   SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2798   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2799 
2800   return Trunc;
2801 }
2802 
2803 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2804   if (Op.getValueType() == MVT::i1)
2805     return LowerLOADi1(Op, DAG);
2806 
2807   // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle
2808   // unaligned loads and have to handle it here.
2809   EVT VT = Op.getValueType();
2810   if (Isv2x16VT(VT) || VT == MVT::v4i8) {
2811     LoadSDNode *Load = cast<LoadSDNode>(Op);
2812     EVT MemVT = Load->getMemoryVT();
2813     if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2814                                         MemVT, *Load->getMemOperand())) {
2815       SDValue Ops[2];
2816       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2817       return DAG.getMergeValues(Ops, SDLoc(Op));
2818     }
2819   }
2820 
2821   return SDValue();
2822 }
2823 
2824 // v = ld i1* addr
2825 //   =>
2826 // v1 = ld i8* addr (-> i16)
2827 // v = trunc i16 to i1
2828 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2829   SDNode *Node = Op.getNode();
2830   LoadSDNode *LD = cast<LoadSDNode>(Node);
2831   SDLoc dl(Node);
2832   assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2833   assert(Node->getValueType(0) == MVT::i1 &&
2834          "Custom lowering for i1 load only");
2835   SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2836                               LD->getPointerInfo(), LD->getAlign(),
2837                               LD->getMemOperand()->getFlags());
2838   SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2839   // The legalizer (the caller) is expecting two values from the legalized
2840   // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2841   // in LegalizeDAG.cpp which also uses MergeValues.
2842   SDValue Ops[] = { result, LD->getChain() };
2843   return DAG.getMergeValues(Ops, dl);
2844 }
2845 
2846 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2847   StoreSDNode *Store = cast<StoreSDNode>(Op);
2848   EVT VT = Store->getMemoryVT();
2849 
2850   if (VT == MVT::i1)
2851     return LowerSTOREi1(Op, DAG);
2852 
2853   // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on the legalizer to
2854   // handle unaligned stores and have to handle them here.
2855   if ((Isv2x16VT(VT) || VT == MVT::v4i8) &&
2856       !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2857                                       VT, *Store->getMemOperand()))
2858     return expandUnalignedStore(Store, DAG);
2859 
2860   // v2f16, v2bf16 and v2i16 don't need special handling.
2861   // v2f16, v2bf16, v2i16 and v4i8 don't need any further special handling.
2862     return SDValue();
2863 
2864   if (VT.isVector())
2865     return LowerSTOREVector(Op, DAG);
2866 
2867   return SDValue();
2868 }
2869 
2870 SDValue
2871 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2872   SDNode *N = Op.getNode();
2873   SDValue Val = N->getOperand(1);
2874   SDLoc DL(N);
2875   EVT ValVT = Val.getValueType();
2876 
2877   if (ValVT.isVector()) {
2878     // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2879     // legal.  We can (and should) split that into 2 stores of <2 x double> here
2880     // but I'm leaving that as a TODO for now.
2881     if (!ValVT.isSimple())
2882       return SDValue();
2883     switch (ValVT.getSimpleVT().SimpleTy) {
2884     default:
2885       return SDValue();
2886     case MVT::v2i8:
2887     case MVT::v2i16:
2888     case MVT::v2i32:
2889     case MVT::v2i64:
2890     case MVT::v2f16:
2891     case MVT::v2bf16:
2892     case MVT::v2f32:
2893     case MVT::v2f64:
2894     case MVT::v4i8:
2895     case MVT::v4i16:
2896     case MVT::v4i32:
2897     case MVT::v4f16:
2898     case MVT::v4bf16:
2899     case MVT::v4f32:
2900     case MVT::v8f16: // <4 x f16x2>
2901     case MVT::v8bf16: // <4 x bf16x2>
2902     case MVT::v8i16:  // <4 x i16x2>
2903       // This is a "native" vector type
2904       break;
2905     }
2906 
2907     MemSDNode *MemSD = cast<MemSDNode>(N);
2908     const DataLayout &TD = DAG.getDataLayout();
2909 
2910     Align Alignment = MemSD->getAlign();
2911     Align PrefAlign =
2912         TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
2913     if (Alignment < PrefAlign) {
2914       // This store is not sufficiently aligned, so bail out and let this vector
2915       // store be scalarized.  Note that we may still be able to emit smaller
2916       // vector stores.  For example, if we are storing a <4 x float> with an
2917       // alignment of 8, this check will fail but the legalizer will try again
2918       // with 2 x <2 x float>, which will succeed with an alignment of 8.
2919       return SDValue();
2920     }
2921 
2922     unsigned Opcode = 0;
2923     EVT EltVT = ValVT.getVectorElementType();
2924     unsigned NumElts = ValVT.getVectorNumElements();
2925 
2926     // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
2927     // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
2928     // stored type to i16 and propagate the "real" type as the memory type.
2929     bool NeedExt = false;
2930     if (EltVT.getSizeInBits() < 16)
2931       NeedExt = true;
2932 
2933     bool StoreF16x2 = false;
2934     switch (NumElts) {
2935     default:
2936       return SDValue();
2937     case 2:
2938       Opcode = NVPTXISD::StoreV2;
2939       break;
2940     case 4:
2941       Opcode = NVPTXISD::StoreV4;
2942       break;
2943     case 8:
2944       // v8 vectors of 16-bit elements (v8f16/v8bf16/v8i16) are a special case:
2945       // PTX doesn't have an st.v8 instruction, so we split the vector into
2946       // v2 chunks of 16-bit elements and store them with st.v4.b32.
2947       assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector.");
2948       Opcode = NVPTXISD::StoreV4;
2949       StoreF16x2 = true;
2950       break;
2951     }
2952 
2953     SmallVector<SDValue, 8> Ops;
2954 
2955     // First is the chain
2956     Ops.push_back(N->getOperand(0));
2957 
2958     if (StoreF16x2) {
2959       // Combine f16,f16 -> v2f16
2960       NumElts /= 2;
2961       for (unsigned i = 0; i < NumElts; ++i) {
2962         SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2963                                  DAG.getIntPtrConstant(i * 2, DL));
2964         SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2965                                  DAG.getIntPtrConstant(i * 2 + 1, DL));
2966         EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2);
2967         SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1);
2968         Ops.push_back(V2);
2969       }
2970     } else {
2971       // Then the split values
2972       for (unsigned i = 0; i < NumElts; ++i) {
2973         SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2974                                      DAG.getIntPtrConstant(i, DL));
2975         if (NeedExt)
2976           ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
2977         Ops.push_back(ExtVal);
2978       }
2979     }
2980 
2981     // Then any remaining arguments
2982     Ops.append(N->op_begin() + 2, N->op_end());
2983 
2984     SDValue NewSt =
2985         DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
2986                                 MemSD->getMemoryVT(), MemSD->getMemOperand());
2987 
2988     // return DCI.CombineTo(N, NewSt, true);
2989     return NewSt;
2990   }
2991 
2992   return SDValue();
2993 }
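// For illustration of the NumElts == 8 path above: a sufficiently aligned
// store of <8 x half> becomes a single StoreV4 whose four value operands are
// v2f16 pairs built from lanes {0,1}, {2,3}, {4,5}, {6,7}, i.e. it is
// ultimately emitted as one st.v4.b32 rather than eight scalar stores.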
2994 
2995 // st i1 v, addr
2996 //    =>
2997 // v1 = zxt v to i16
2998 // st.u8 i16, addr
2999 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3000   SDNode *Node = Op.getNode();
3001   SDLoc dl(Node);
3002   StoreSDNode *ST = cast<StoreSDNode>(Node);
3003   SDValue Tmp1 = ST->getChain();
3004   SDValue Tmp2 = ST->getBasePtr();
3005   SDValue Tmp3 = ST->getValue();
3006   assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3007   Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3008   SDValue Result =
3009       DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3010                         ST->getAlign(), ST->getMemOperand()->getFlags());
3011   return Result;
3012 }
3013 
3014 // This creates a target external symbol for a function parameter.
3015 // The name of the symbol is composed from the parameter's index and the
3016 // function name. A negative index corresponds to the special parameter
3017 // (an unsized array) used for passing variable arguments.
3018 SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,
3019                                             EVT v) const {
3020   StringRef SavedStr = nvTM->getStrPool().save(
3021       getParamName(&DAG.getMachineFunction().getFunction(), idx));
3022   return DAG.getTargetExternalSymbol(SavedStr.data(), v);
3023 }
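// For illustration (the exact spelling comes from getParamName): the second
// parameter of a function "foo" typically becomes a symbol like "foo_param_1",
// while idx == -1 maps to the unsized "foo_vararg" array used by LowerVASTART
// above.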
3024 
3025 SDValue NVPTXTargetLowering::LowerFormalArguments(
3026     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3027     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3028     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3029   MachineFunction &MF = DAG.getMachineFunction();
3030   const DataLayout &DL = DAG.getDataLayout();
3031   auto PtrVT = getPointerTy(DAG.getDataLayout());
3032 
3033   const Function *F = &MF.getFunction();
3034   const AttributeList &PAL = F->getAttributes();
3035   const TargetLowering *TLI = STI.getTargetLowering();
3036 
3037   SDValue Root = DAG.getRoot();
3038   std::vector<SDValue> OutChains;
3039 
3040   bool isABI = (STI.getSmVersion() >= 20);
3041   assert(isABI && "Non-ABI compilation is not supported");
3042   if (!isABI)
3043     return Chain;
3044 
3045   std::vector<Type *> argTypes;
3046   std::vector<const Argument *> theArgs;
3047   for (const Argument &I : F->args()) {
3048     theArgs.push_back(&I);
3049     argTypes.push_back(I.getType());
3050   }
3051   // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
3052   // Ins.size() will be larger
3053   //   * if there is an aggregate argument with multiple fields (each field
3054   //     showing up separately in Ins)
3055   //   * if there is a vector argument with more than typical vector-length
3056   //     elements (generally if more than 4) where each vector element is
3057   //     individually present in Ins.
3058   // So a different index should be used for indexing into Ins.
3059   // See similar issue in LowerCall.
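  // For example, an argument of type {i32, float} contributes one entry to
  // argTypes/theArgs but two entries to Ins (one per field), so InsIdx below
  // advances faster than i.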
3060   unsigned InsIdx = 0;
3061 
3062   int idx = 0;
3063   for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
3064     Type *Ty = argTypes[i];
3065 
3066     if (theArgs[i]->use_empty()) {
3067       // argument is dead
3068       if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) {
3069         SmallVector<EVT, 16> vtparts;
3070 
3071         ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
3072         if (vtparts.empty())
3073           report_fatal_error("Empty parameter types are not supported");
3074 
3075         for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
3076              ++parti) {
3077           InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3078           ++InsIdx;
3079         }
3080         if (vtparts.size() > 0)
3081           --InsIdx;
3082         continue;
3083       }
3084       if (Ty->isVectorTy()) {
3085         EVT ObjectVT = getValueType(DL, Ty);
3086         unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
3087         for (unsigned parti = 0; parti < NumRegs; ++parti) {
3088           InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3089           ++InsIdx;
3090         }
3091         if (NumRegs > 0)
3092           --InsIdx;
3093         continue;
3094       }
3095       InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3096       continue;
3097     }
3098 
3099     // In the following cases, assign a node order of "idx+1"
3100     // to newly created nodes. The SDNodes for params have to
3101     // appear in the same order as their order of appearance
3102     // in the original function. "idx+1" holds that order.
3103     if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
3104       bool aggregateIsPacked = false;
3105       if (StructType *STy = dyn_cast<StructType>(Ty))
3106         aggregateIsPacked = STy->isPacked();
3107 
3108       SmallVector<EVT, 16> VTs;
3109       SmallVector<uint64_t, 16> Offsets;
3110       ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
3111       if (VTs.empty())
3112         report_fatal_error("Empty parameter types are not supported");
3113 
3114       auto VectorInfo =
3115           VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty));
3116 
3117       SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
3118       int VecIdx = -1; // Index of the first element of the current vector.
3119       for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
3120         if (VectorInfo[parti] & PVF_FIRST) {
3121           assert(VecIdx == -1 && "Orphaned vector.");
3122           VecIdx = parti;
3123         }
3124 
3125         // That's the last element of this store op.
3126         // That's the last element of this load op.
3127           unsigned NumElts = parti - VecIdx + 1;
3128           EVT EltVT = VTs[parti];
3129           // i1 is loaded/stored as i8.
3130           EVT LoadVT = EltVT;
3131           if (EltVT == MVT::i1)
3132             LoadVT = MVT::i8;
3133           else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
3134             // getLoad needs a vector type, but it can't handle
3135             // vectors which contain v2f16 or v2bf16 elements. So we must load
3136             // using i32 here and then bitcast back.
3137             LoadVT = MVT::i32;
3138 
3139           EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
3140           SDValue VecAddr =
3141               DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
3142                           DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
3143           Value *srcValue = Constant::getNullValue(PointerType::get(
3144               EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
3145           SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
3146                                   MachinePointerInfo(srcValue),
3147                                   MaybeAlign(aggregateIsPacked ? 1 : 0),
3148                                   MachineMemOperand::MODereferenceable |
3149                                       MachineMemOperand::MOInvariant);
3150           if (P.getNode())
3151             P.getNode()->setIROrder(idx + 1);
3152           for (unsigned j = 0; j < NumElts; ++j) {
3153             SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
3154                                       DAG.getIntPtrConstant(j, dl));
3155             // We've loaded i1 as an i8 and now must truncate it back to i1
3156             if (EltVT == MVT::i1)
3157               Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
3158             // v2f16 was loaded as an i32. Now we must bitcast it back.
3159             else if (EltVT != LoadVT)
3160               Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt);
3161 
3162             // If a promoted integer type is used, truncate down to the original
3163             MVT PromotedVT;
3164             if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
3165               Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
3166             }
3167 
3168             // Extend the element if necessary (e.g. an i8 is loaded
3169             // into an i16 register)
3170             if (Ins[InsIdx].VT.isInteger() &&
3171                 Ins[InsIdx].VT.getFixedSizeInBits() >
3172                     LoadVT.getFixedSizeInBits()) {
3173               unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
3174                                                            : ISD::ZERO_EXTEND;
3175               Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
3176             }
3177             InVals.push_back(Elt);
3178           }
3179 
3180           // Reset vector tracking state.
3181           VecIdx = -1;
3182         }
3183         ++InsIdx;
3184       }
3185       if (VTs.size() > 0)
3186         --InsIdx;
3187       continue;
3188     }
3189 
3190     // Param has ByVal attribute
3191     // Return MoveParam(param symbol).
3192     // Ideally, the param symbol could be returned directly,
3193     // but when the SDNode builder decides to use it in a CopyToReg(),
3194     // the machine instruction fails because TargetExternalSymbol
3195     // (not lowered) is target dependent, and CopyToReg assumes
3196     // the source is lowered.
3197     EVT ObjectVT = getValueType(DL, Ty);
3198     assert(ObjectVT == Ins[InsIdx].VT &&
3199            "Ins type did not match function type");
3200     SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
3201     SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
3202     if (p.getNode())
3203       p.getNode()->setIROrder(idx + 1);
3204     InVals.push_back(p);
3205   }
3206 
3207   if (!OutChains.empty())
3208     DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
3209 
3210   return Chain;
3211 }
3212 
3213 SDValue
3214 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3215                                  bool isVarArg,
3216                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
3217                                  const SmallVectorImpl<SDValue> &OutVals,
3218                                  const SDLoc &dl, SelectionDAG &DAG) const {
3219   const MachineFunction &MF = DAG.getMachineFunction();
3220   const Function &F = MF.getFunction();
3221   Type *RetTy = MF.getFunction().getReturnType();
3222 
3223   bool isABI = (STI.getSmVersion() >= 20);
3224   assert(isABI && "Non-ABI compilation is not supported");
3225   if (!isABI)
3226     return Chain;
3227 
3228   const DataLayout &DL = DAG.getDataLayout();
3229   SmallVector<SDValue, 16> PromotedOutVals;
3230   SmallVector<EVT, 16> VTs;
3231   SmallVector<uint64_t, 16> Offsets;
3232   ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
3233   assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3234 
3235   for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3236     SDValue PromotedOutVal = OutVals[i];
3237     MVT PromotedVT;
3238     if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
3239       VTs[i] = EVT(PromotedVT);
3240     }
3241     if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) {
3242       llvm::ISD::NodeType Ext =
3243           Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3244       PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
3245     }
3246     PromotedOutVals.push_back(PromotedOutVal);
3247   }
3248 
3249   auto VectorInfo = VectorizePTXValueVTs(
3250       VTs, Offsets,
3251       RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
3252                        : Align(1));
3253 
3254   // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3255   // 32-bits are sign extended or zero extended, depending on whether
3256   // they are signed or unsigned types.
3257   bool ExtendIntegerRetVal =
3258       RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
3259 
3260   SmallVector<SDValue, 6> StoreOperands;
3261   for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3262     // New load/store. Record chain and offset operands.
3263     if (VectorInfo[i] & PVF_FIRST) {
3264       assert(StoreOperands.empty() && "Orphaned operand list.");
3265       StoreOperands.push_back(Chain);
3266       StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
3267     }
3268 
3269     SDValue OutVal = OutVals[i];
3270     SDValue RetVal = PromotedOutVals[i];
3271 
3272     if (ExtendIntegerRetVal) {
3273       RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
3274                                                   : ISD::ZERO_EXTEND,
3275                            dl, MVT::i32, RetVal);
3276     } else if (OutVal.getValueSizeInBits() < 16) {
3277       // Use 16-bit registers for small load-stores as it's the
3278       // smallest general purpose register size supported by NVPTX.
3279       RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
3280     }
3281 
3282     // Record the value to return.
3283     StoreOperands.push_back(RetVal);
3284 
3285     // That's the last element of this store op.
3286     if (VectorInfo[i] & PVF_LAST) {
3287       NVPTXISD::NodeType Op;
3288       unsigned NumElts = StoreOperands.size() - 2;
3289       switch (NumElts) {
3290       case 1:
3291         Op = NVPTXISD::StoreRetval;
3292         break;
3293       case 2:
3294         Op = NVPTXISD::StoreRetvalV2;
3295         break;
3296       case 4:
3297         Op = NVPTXISD::StoreRetvalV4;
3298         break;
3299       default:
3300         llvm_unreachable("Invalid vector info.");
3301       }
3302 
3303       // Adjust type of load/store op if we've extended the scalar
3304       // return value.
3305       EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3306       Chain = DAG.getMemIntrinsicNode(
3307           Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
3308           MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
3309       // Cleanup vector state.
3310       StoreOperands.clear();
3311     }
3312   }
3313 
3314   return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3315 }
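// For illustration of the return path above: a function returning i8 has the
// value extended to i32 (per the PTX interoperability rule quoted above) and
// stored with a single StoreRetval of type i32, while a <4 x float> return
// with sufficient alignment is emitted as one StoreRetvalV4 covering all four
// lanes.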
3316 
3317 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
3318     SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
3319     SelectionDAG &DAG) const {
3320   if (Constraint.size() > 1)
3321     return;
3322   TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3323 }
3324 
3325 static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
3326   switch (Intrinsic) {
3327   default:
3328     return 0;
3329 
3330   case Intrinsic::nvvm_tex_1d_v4f32_s32:
3331     return NVPTXISD::Tex1DFloatS32;
3332   case Intrinsic::nvvm_tex_1d_v4f32_f32:
3333     return NVPTXISD::Tex1DFloatFloat;
3334   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3335     return NVPTXISD::Tex1DFloatFloatLevel;
3336   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3337     return NVPTXISD::Tex1DFloatFloatGrad;
3338   case Intrinsic::nvvm_tex_1d_v4s32_s32:
3339     return NVPTXISD::Tex1DS32S32;
3340   case Intrinsic::nvvm_tex_1d_v4s32_f32:
3341     return NVPTXISD::Tex1DS32Float;
3342   case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3343     return NVPTXISD::Tex1DS32FloatLevel;
3344   case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3345     return NVPTXISD::Tex1DS32FloatGrad;
3346   case Intrinsic::nvvm_tex_1d_v4u32_s32:
3347     return NVPTXISD::Tex1DU32S32;
3348   case Intrinsic::nvvm_tex_1d_v4u32_f32:
3349     return NVPTXISD::Tex1DU32Float;
3350   case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3351     return NVPTXISD::Tex1DU32FloatLevel;
3352   case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3353     return NVPTXISD::Tex1DU32FloatGrad;
3354 
3355   case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3356     return NVPTXISD::Tex1DArrayFloatS32;
3357   case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3358     return NVPTXISD::Tex1DArrayFloatFloat;
3359   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3360     return NVPTXISD::Tex1DArrayFloatFloatLevel;
3361   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3362     return NVPTXISD::Tex1DArrayFloatFloatGrad;
3363   case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3364     return NVPTXISD::Tex1DArrayS32S32;
3365   case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3366     return NVPTXISD::Tex1DArrayS32Float;
3367   case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3368     return NVPTXISD::Tex1DArrayS32FloatLevel;
3369   case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3370     return NVPTXISD::Tex1DArrayS32FloatGrad;
3371   case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3372     return NVPTXISD::Tex1DArrayU32S32;
3373   case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3374     return NVPTXISD::Tex1DArrayU32Float;
3375   case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3376     return NVPTXISD::Tex1DArrayU32FloatLevel;
3377   case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3378     return NVPTXISD::Tex1DArrayU32FloatGrad;
3379 
3380   case Intrinsic::nvvm_tex_2d_v4f32_s32:
3381     return NVPTXISD::Tex2DFloatS32;
3382   case Intrinsic::nvvm_tex_2d_v4f32_f32:
3383     return NVPTXISD::Tex2DFloatFloat;
3384   case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3385     return NVPTXISD::Tex2DFloatFloatLevel;
3386   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3387     return NVPTXISD::Tex2DFloatFloatGrad;
3388   case Intrinsic::nvvm_tex_2d_v4s32_s32:
3389     return NVPTXISD::Tex2DS32S32;
3390   case Intrinsic::nvvm_tex_2d_v4s32_f32:
3391     return NVPTXISD::Tex2DS32Float;
3392   case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3393     return NVPTXISD::Tex2DS32FloatLevel;
3394   case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3395     return NVPTXISD::Tex2DS32FloatGrad;
3396   case Intrinsic::nvvm_tex_2d_v4u32_s32:
3397     return NVPTXISD::Tex2DU32S32;
3398   case Intrinsic::nvvm_tex_2d_v4u32_f32:
3399     return NVPTXISD::Tex2DU32Float;
3400   case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3401     return NVPTXISD::Tex2DU32FloatLevel;
3402   case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3403     return NVPTXISD::Tex2DU32FloatGrad;
3404 
3405   case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3406     return NVPTXISD::Tex2DArrayFloatS32;
3407   case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3408     return NVPTXISD::Tex2DArrayFloatFloat;
3409   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3410     return NVPTXISD::Tex2DArrayFloatFloatLevel;
3411   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3412     return NVPTXISD::Tex2DArrayFloatFloatGrad;
3413   case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3414     return NVPTXISD::Tex2DArrayS32S32;
3415   case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3416     return NVPTXISD::Tex2DArrayS32Float;
3417   case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3418     return NVPTXISD::Tex2DArrayS32FloatLevel;
3419   case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3420     return NVPTXISD::Tex2DArrayS32FloatGrad;
3421   case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3422     return NVPTXISD::Tex2DArrayU32S32;
3423   case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3424     return NVPTXISD::Tex2DArrayU32Float;
3425   case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3426     return NVPTXISD::Tex2DArrayU32FloatLevel;
3427   case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3428     return NVPTXISD::Tex2DArrayU32FloatGrad;
3429 
3430   case Intrinsic::nvvm_tex_3d_v4f32_s32:
3431     return NVPTXISD::Tex3DFloatS32;
3432   case Intrinsic::nvvm_tex_3d_v4f32_f32:
3433     return NVPTXISD::Tex3DFloatFloat;
3434   case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3435     return NVPTXISD::Tex3DFloatFloatLevel;
3436   case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3437     return NVPTXISD::Tex3DFloatFloatGrad;
3438   case Intrinsic::nvvm_tex_3d_v4s32_s32:
3439     return NVPTXISD::Tex3DS32S32;
3440   case Intrinsic::nvvm_tex_3d_v4s32_f32:
3441     return NVPTXISD::Tex3DS32Float;
3442   case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3443     return NVPTXISD::Tex3DS32FloatLevel;
3444   case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3445     return NVPTXISD::Tex3DS32FloatGrad;
3446   case Intrinsic::nvvm_tex_3d_v4u32_s32:
3447     return NVPTXISD::Tex3DU32S32;
3448   case Intrinsic::nvvm_tex_3d_v4u32_f32:
3449     return NVPTXISD::Tex3DU32Float;
3450   case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3451     return NVPTXISD::Tex3DU32FloatLevel;
3452   case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3453     return NVPTXISD::Tex3DU32FloatGrad;
3454 
3455   case Intrinsic::nvvm_tex_cube_v4f32_f32:
3456     return NVPTXISD::TexCubeFloatFloat;
3457   case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3458     return NVPTXISD::TexCubeFloatFloatLevel;
3459   case Intrinsic::nvvm_tex_cube_v4s32_f32:
3460     return NVPTXISD::TexCubeS32Float;
3461   case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3462     return NVPTXISD::TexCubeS32FloatLevel;
3463   case Intrinsic::nvvm_tex_cube_v4u32_f32:
3464     return NVPTXISD::TexCubeU32Float;
3465   case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3466     return NVPTXISD::TexCubeU32FloatLevel;
3467 
3468   case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3469     return NVPTXISD::TexCubeArrayFloatFloat;
3470   case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3471     return NVPTXISD::TexCubeArrayFloatFloatLevel;
3472   case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3473     return NVPTXISD::TexCubeArrayS32Float;
3474   case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3475     return NVPTXISD::TexCubeArrayS32FloatLevel;
3476   case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3477     return NVPTXISD::TexCubeArrayU32Float;
3478   case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3479     return NVPTXISD::TexCubeArrayU32FloatLevel;
3480 
3481   case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3482     return NVPTXISD::Tld4R2DFloatFloat;
3483   case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3484     return NVPTXISD::Tld4G2DFloatFloat;
3485   case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3486     return NVPTXISD::Tld4B2DFloatFloat;
3487   case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3488     return NVPTXISD::Tld4A2DFloatFloat;
3489   case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3490     return NVPTXISD::Tld4R2DS64Float;
3491   case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3492     return NVPTXISD::Tld4G2DS64Float;
3493   case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3494     return NVPTXISD::Tld4B2DS64Float;
3495   case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3496     return NVPTXISD::Tld4A2DS64Float;
3497   case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3498     return NVPTXISD::Tld4R2DU64Float;
3499   case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3500     return NVPTXISD::Tld4G2DU64Float;
3501   case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3502     return NVPTXISD::Tld4B2DU64Float;
3503   case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3504     return NVPTXISD::Tld4A2DU64Float;
3505 
3506   case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3507     return NVPTXISD::TexUnified1DFloatS32;
3508   case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3509     return NVPTXISD::TexUnified1DFloatFloat;
3510   case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3511     return NVPTXISD::TexUnified1DFloatFloatLevel;
3512   case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3513     return NVPTXISD::TexUnified1DFloatFloatGrad;
3514   case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3515     return NVPTXISD::TexUnified1DS32S32;
3516   case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3517     return NVPTXISD::TexUnified1DS32Float;
3518   case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3519     return NVPTXISD::TexUnified1DS32FloatLevel;
3520   case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3521     return NVPTXISD::TexUnified1DS32FloatGrad;
3522   case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3523     return NVPTXISD::TexUnified1DU32S32;
3524   case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3525     return NVPTXISD::TexUnified1DU32Float;
3526   case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3527     return NVPTXISD::TexUnified1DU32FloatLevel;
3528   case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3529     return NVPTXISD::TexUnified1DU32FloatGrad;
3530 
3531   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3532     return NVPTXISD::TexUnified1DArrayFloatS32;
3533   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3534     return NVPTXISD::TexUnified1DArrayFloatFloat;
3535   case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3536     return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
3537   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3538     return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
3539   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3540     return NVPTXISD::TexUnified1DArrayS32S32;
3541   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3542     return NVPTXISD::TexUnified1DArrayS32Float;
3543   case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3544     return NVPTXISD::TexUnified1DArrayS32FloatLevel;
3545   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3546     return NVPTXISD::TexUnified1DArrayS32FloatGrad;
3547   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3548     return NVPTXISD::TexUnified1DArrayU32S32;
3549   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3550     return NVPTXISD::TexUnified1DArrayU32Float;
3551   case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3552     return NVPTXISD::TexUnified1DArrayU32FloatLevel;
3553   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3554     return NVPTXISD::TexUnified1DArrayU32FloatGrad;
3555 
3556   case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3557     return NVPTXISD::TexUnified2DFloatS32;
3558   case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3559     return NVPTXISD::TexUnified2DFloatFloat;
3560   case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3561     return NVPTXISD::TexUnified2DFloatFloatLevel;
3562   case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3563     return NVPTXISD::TexUnified2DFloatFloatGrad;
3564   case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3565     return NVPTXISD::TexUnified2DS32S32;
3566   case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3567     return NVPTXISD::TexUnified2DS32Float;
3568   case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3569     return NVPTXISD::TexUnified2DS32FloatLevel;
3570   case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3571     return NVPTXISD::TexUnified2DS32FloatGrad;
3572   case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3573     return NVPTXISD::TexUnified2DU32S32;
3574   case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3575     return NVPTXISD::TexUnified2DU32Float;
3576   case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3577     return NVPTXISD::TexUnified2DU32FloatLevel;
3578   case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3579     return NVPTXISD::TexUnified2DU32FloatGrad;
3580 
3581   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3582     return NVPTXISD::TexUnified2DArrayFloatS32;
3583   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3584     return NVPTXISD::TexUnified2DArrayFloatFloat;
3585   case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3586     return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
3587   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3588     return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
3589   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3590     return NVPTXISD::TexUnified2DArrayS32S32;
3591   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3592     return NVPTXISD::TexUnified2DArrayS32Float;
3593   case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3594     return NVPTXISD::TexUnified2DArrayS32FloatLevel;
3595   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3596     return NVPTXISD::TexUnified2DArrayS32FloatGrad;
3597   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3598     return NVPTXISD::TexUnified2DArrayU32S32;
3599   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3600     return NVPTXISD::TexUnified2DArrayU32Float;
3601   case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3602     return NVPTXISD::TexUnified2DArrayU32FloatLevel;
3603   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3604     return NVPTXISD::TexUnified2DArrayU32FloatGrad;
3605 
3606   case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3607     return NVPTXISD::TexUnified3DFloatS32;
3608   case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3609     return NVPTXISD::TexUnified3DFloatFloat;
3610   case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3611     return NVPTXISD::TexUnified3DFloatFloatLevel;
3612   case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3613     return NVPTXISD::TexUnified3DFloatFloatGrad;
3614   case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3615     return NVPTXISD::TexUnified3DS32S32;
3616   case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3617     return NVPTXISD::TexUnified3DS32Float;
3618   case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3619     return NVPTXISD::TexUnified3DS32FloatLevel;
3620   case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3621     return NVPTXISD::TexUnified3DS32FloatGrad;
3622   case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3623     return NVPTXISD::TexUnified3DU32S32;
3624   case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3625     return NVPTXISD::TexUnified3DU32Float;
3626   case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3627     return NVPTXISD::TexUnified3DU32FloatLevel;
3628   case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3629     return NVPTXISD::TexUnified3DU32FloatGrad;
3630 
3631   case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3632     return NVPTXISD::TexUnifiedCubeFloatFloat;
3633   case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3634     return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
3635   case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3636     return NVPTXISD::TexUnifiedCubeS32Float;
3637   case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3638     return NVPTXISD::TexUnifiedCubeS32FloatLevel;
3639   case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3640     return NVPTXISD::TexUnifiedCubeU32Float;
3641   case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3642     return NVPTXISD::TexUnifiedCubeU32FloatLevel;
3643 
3644   case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3645     return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
3646   case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3647     return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
3648   case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3649     return NVPTXISD::TexUnifiedCubeArrayS32Float;
3650   case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3651     return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
3652   case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3653     return NVPTXISD::TexUnifiedCubeArrayU32Float;
3654   case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3655     return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
3656 
3657   case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3658     return NVPTXISD::Tld4UnifiedR2DFloatFloat;
3659   case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3660     return NVPTXISD::Tld4UnifiedG2DFloatFloat;
3661   case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3662     return NVPTXISD::Tld4UnifiedB2DFloatFloat;
3663   case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3664     return NVPTXISD::Tld4UnifiedA2DFloatFloat;
3665   case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3666     return NVPTXISD::Tld4UnifiedR2DS64Float;
3667   case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3668     return NVPTXISD::Tld4UnifiedG2DS64Float;
3669   case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3670     return NVPTXISD::Tld4UnifiedB2DS64Float;
3671   case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3672     return NVPTXISD::Tld4UnifiedA2DS64Float;
3673   case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3674     return NVPTXISD::Tld4UnifiedR2DU64Float;
3675   case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3676     return NVPTXISD::Tld4UnifiedG2DU64Float;
3677   case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3678     return NVPTXISD::Tld4UnifiedB2DU64Float;
3679   case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3680     return NVPTXISD::Tld4UnifiedA2DU64Float;
3681   }
3682 }
3683 
3684 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3685   switch (Intrinsic) {
3686   default:
3687     return 0;
3688   case Intrinsic::nvvm_suld_1d_i8_clamp:
3689     return NVPTXISD::Suld1DI8Clamp;
3690   case Intrinsic::nvvm_suld_1d_i16_clamp:
3691     return NVPTXISD::Suld1DI16Clamp;
3692   case Intrinsic::nvvm_suld_1d_i32_clamp:
3693     return NVPTXISD::Suld1DI32Clamp;
3694   case Intrinsic::nvvm_suld_1d_i64_clamp:
3695     return NVPTXISD::Suld1DI64Clamp;
3696   case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3697     return NVPTXISD::Suld1DV2I8Clamp;
3698   case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3699     return NVPTXISD::Suld1DV2I16Clamp;
3700   case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3701     return NVPTXISD::Suld1DV2I32Clamp;
3702   case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3703     return NVPTXISD::Suld1DV2I64Clamp;
3704   case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3705     return NVPTXISD::Suld1DV4I8Clamp;
3706   case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3707     return NVPTXISD::Suld1DV4I16Clamp;
3708   case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3709     return NVPTXISD::Suld1DV4I32Clamp;
3710   case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3711     return NVPTXISD::Suld1DArrayI8Clamp;
3712   case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3713     return NVPTXISD::Suld1DArrayI16Clamp;
3714   case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3715     return NVPTXISD::Suld1DArrayI32Clamp;
3716   case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3717     return NVPTXISD::Suld1DArrayI64Clamp;
3718   case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3719     return NVPTXISD::Suld1DArrayV2I8Clamp;
3720   case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3721     return NVPTXISD::Suld1DArrayV2I16Clamp;
3722   case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3723     return NVPTXISD::Suld1DArrayV2I32Clamp;
3724   case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3725     return NVPTXISD::Suld1DArrayV2I64Clamp;
3726   case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3727     return NVPTXISD::Suld1DArrayV4I8Clamp;
3728   case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3729     return NVPTXISD::Suld1DArrayV4I16Clamp;
3730   case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3731     return NVPTXISD::Suld1DArrayV4I32Clamp;
3732   case Intrinsic::nvvm_suld_2d_i8_clamp:
3733     return NVPTXISD::Suld2DI8Clamp;
3734   case Intrinsic::nvvm_suld_2d_i16_clamp:
3735     return NVPTXISD::Suld2DI16Clamp;
3736   case Intrinsic::nvvm_suld_2d_i32_clamp:
3737     return NVPTXISD::Suld2DI32Clamp;
3738   case Intrinsic::nvvm_suld_2d_i64_clamp:
3739     return NVPTXISD::Suld2DI64Clamp;
3740   case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3741     return NVPTXISD::Suld2DV2I8Clamp;
3742   case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3743     return NVPTXISD::Suld2DV2I16Clamp;
3744   case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3745     return NVPTXISD::Suld2DV2I32Clamp;
3746   case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3747     return NVPTXISD::Suld2DV2I64Clamp;
3748   case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3749     return NVPTXISD::Suld2DV4I8Clamp;
3750   case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3751     return NVPTXISD::Suld2DV4I16Clamp;
3752   case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3753     return NVPTXISD::Suld2DV4I32Clamp;
3754   case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3755     return NVPTXISD::Suld2DArrayI8Clamp;
3756   case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3757     return NVPTXISD::Suld2DArrayI16Clamp;
3758   case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3759     return NVPTXISD::Suld2DArrayI32Clamp;
3760   case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3761     return NVPTXISD::Suld2DArrayI64Clamp;
3762   case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3763     return NVPTXISD::Suld2DArrayV2I8Clamp;
3764   case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3765     return NVPTXISD::Suld2DArrayV2I16Clamp;
3766   case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3767     return NVPTXISD::Suld2DArrayV2I32Clamp;
3768   case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3769     return NVPTXISD::Suld2DArrayV2I64Clamp;
3770   case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3771     return NVPTXISD::Suld2DArrayV4I8Clamp;
3772   case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3773     return NVPTXISD::Suld2DArrayV4I16Clamp;
3774   case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3775     return NVPTXISD::Suld2DArrayV4I32Clamp;
3776   case Intrinsic::nvvm_suld_3d_i8_clamp:
3777     return NVPTXISD::Suld3DI8Clamp;
3778   case Intrinsic::nvvm_suld_3d_i16_clamp:
3779     return NVPTXISD::Suld3DI16Clamp;
3780   case Intrinsic::nvvm_suld_3d_i32_clamp:
3781     return NVPTXISD::Suld3DI32Clamp;
3782   case Intrinsic::nvvm_suld_3d_i64_clamp:
3783     return NVPTXISD::Suld3DI64Clamp;
3784   case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3785     return NVPTXISD::Suld3DV2I8Clamp;
3786   case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3787     return NVPTXISD::Suld3DV2I16Clamp;
3788   case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3789     return NVPTXISD::Suld3DV2I32Clamp;
3790   case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3791     return NVPTXISD::Suld3DV2I64Clamp;
3792   case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3793     return NVPTXISD::Suld3DV4I8Clamp;
3794   case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3795     return NVPTXISD::Suld3DV4I16Clamp;
3796   case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3797     return NVPTXISD::Suld3DV4I32Clamp;
3798   case Intrinsic::nvvm_suld_1d_i8_trap:
3799     return NVPTXISD::Suld1DI8Trap;
3800   case Intrinsic::nvvm_suld_1d_i16_trap:
3801     return NVPTXISD::Suld1DI16Trap;
3802   case Intrinsic::nvvm_suld_1d_i32_trap:
3803     return NVPTXISD::Suld1DI32Trap;
3804   case Intrinsic::nvvm_suld_1d_i64_trap:
3805     return NVPTXISD::Suld1DI64Trap;
3806   case Intrinsic::nvvm_suld_1d_v2i8_trap:
3807     return NVPTXISD::Suld1DV2I8Trap;
3808   case Intrinsic::nvvm_suld_1d_v2i16_trap:
3809     return NVPTXISD::Suld1DV2I16Trap;
3810   case Intrinsic::nvvm_suld_1d_v2i32_trap:
3811     return NVPTXISD::Suld1DV2I32Trap;
3812   case Intrinsic::nvvm_suld_1d_v2i64_trap:
3813     return NVPTXISD::Suld1DV2I64Trap;
3814   case Intrinsic::nvvm_suld_1d_v4i8_trap:
3815     return NVPTXISD::Suld1DV4I8Trap;
3816   case Intrinsic::nvvm_suld_1d_v4i16_trap:
3817     return NVPTXISD::Suld1DV4I16Trap;
3818   case Intrinsic::nvvm_suld_1d_v4i32_trap:
3819     return NVPTXISD::Suld1DV4I32Trap;
3820   case Intrinsic::nvvm_suld_1d_array_i8_trap:
3821     return NVPTXISD::Suld1DArrayI8Trap;
3822   case Intrinsic::nvvm_suld_1d_array_i16_trap:
3823     return NVPTXISD::Suld1DArrayI16Trap;
3824   case Intrinsic::nvvm_suld_1d_array_i32_trap:
3825     return NVPTXISD::Suld1DArrayI32Trap;
3826   case Intrinsic::nvvm_suld_1d_array_i64_trap:
3827     return NVPTXISD::Suld1DArrayI64Trap;
3828   case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3829     return NVPTXISD::Suld1DArrayV2I8Trap;
3830   case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3831     return NVPTXISD::Suld1DArrayV2I16Trap;
3832   case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3833     return NVPTXISD::Suld1DArrayV2I32Trap;
3834   case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3835     return NVPTXISD::Suld1DArrayV2I64Trap;
3836   case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3837     return NVPTXISD::Suld1DArrayV4I8Trap;
3838   case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3839     return NVPTXISD::Suld1DArrayV4I16Trap;
3840   case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3841     return NVPTXISD::Suld1DArrayV4I32Trap;
3842   case Intrinsic::nvvm_suld_2d_i8_trap:
3843     return NVPTXISD::Suld2DI8Trap;
3844   case Intrinsic::nvvm_suld_2d_i16_trap:
3845     return NVPTXISD::Suld2DI16Trap;
3846   case Intrinsic::nvvm_suld_2d_i32_trap:
3847     return NVPTXISD::Suld2DI32Trap;
3848   case Intrinsic::nvvm_suld_2d_i64_trap:
3849     return NVPTXISD::Suld2DI64Trap;
3850   case Intrinsic::nvvm_suld_2d_v2i8_trap:
3851     return NVPTXISD::Suld2DV2I8Trap;
3852   case Intrinsic::nvvm_suld_2d_v2i16_trap:
3853     return NVPTXISD::Suld2DV2I16Trap;
3854   case Intrinsic::nvvm_suld_2d_v2i32_trap:
3855     return NVPTXISD::Suld2DV2I32Trap;
3856   case Intrinsic::nvvm_suld_2d_v2i64_trap:
3857     return NVPTXISD::Suld2DV2I64Trap;
3858   case Intrinsic::nvvm_suld_2d_v4i8_trap:
3859     return NVPTXISD::Suld2DV4I8Trap;
3860   case Intrinsic::nvvm_suld_2d_v4i16_trap:
3861     return NVPTXISD::Suld2DV4I16Trap;
3862   case Intrinsic::nvvm_suld_2d_v4i32_trap:
3863     return NVPTXISD::Suld2DV4I32Trap;
3864   case Intrinsic::nvvm_suld_2d_array_i8_trap:
3865     return NVPTXISD::Suld2DArrayI8Trap;
3866   case Intrinsic::nvvm_suld_2d_array_i16_trap:
3867     return NVPTXISD::Suld2DArrayI16Trap;
3868   case Intrinsic::nvvm_suld_2d_array_i32_trap:
3869     return NVPTXISD::Suld2DArrayI32Trap;
3870   case Intrinsic::nvvm_suld_2d_array_i64_trap:
3871     return NVPTXISD::Suld2DArrayI64Trap;
3872   case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3873     return NVPTXISD::Suld2DArrayV2I8Trap;
3874   case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3875     return NVPTXISD::Suld2DArrayV2I16Trap;
3876   case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3877     return NVPTXISD::Suld2DArrayV2I32Trap;
3878   case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3879     return NVPTXISD::Suld2DArrayV2I64Trap;
3880   case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3881     return NVPTXISD::Suld2DArrayV4I8Trap;
3882   case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3883     return NVPTXISD::Suld2DArrayV4I16Trap;
3884   case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3885     return NVPTXISD::Suld2DArrayV4I32Trap;
3886   case Intrinsic::nvvm_suld_3d_i8_trap:
3887     return NVPTXISD::Suld3DI8Trap;
3888   case Intrinsic::nvvm_suld_3d_i16_trap:
3889     return NVPTXISD::Suld3DI16Trap;
3890   case Intrinsic::nvvm_suld_3d_i32_trap:
3891     return NVPTXISD::Suld3DI32Trap;
3892   case Intrinsic::nvvm_suld_3d_i64_trap:
3893     return NVPTXISD::Suld3DI64Trap;
3894   case Intrinsic::nvvm_suld_3d_v2i8_trap:
3895     return NVPTXISD::Suld3DV2I8Trap;
3896   case Intrinsic::nvvm_suld_3d_v2i16_trap:
3897     return NVPTXISD::Suld3DV2I16Trap;
3898   case Intrinsic::nvvm_suld_3d_v2i32_trap:
3899     return NVPTXISD::Suld3DV2I32Trap;
3900   case Intrinsic::nvvm_suld_3d_v2i64_trap:
3901     return NVPTXISD::Suld3DV2I64Trap;
3902   case Intrinsic::nvvm_suld_3d_v4i8_trap:
3903     return NVPTXISD::Suld3DV4I8Trap;
3904   case Intrinsic::nvvm_suld_3d_v4i16_trap:
3905     return NVPTXISD::Suld3DV4I16Trap;
3906   case Intrinsic::nvvm_suld_3d_v4i32_trap:
3907     return NVPTXISD::Suld3DV4I32Trap;
3908   case Intrinsic::nvvm_suld_1d_i8_zero:
3909     return NVPTXISD::Suld1DI8Zero;
3910   case Intrinsic::nvvm_suld_1d_i16_zero:
3911     return NVPTXISD::Suld1DI16Zero;
3912   case Intrinsic::nvvm_suld_1d_i32_zero:
3913     return NVPTXISD::Suld1DI32Zero;
3914   case Intrinsic::nvvm_suld_1d_i64_zero:
3915     return NVPTXISD::Suld1DI64Zero;
3916   case Intrinsic::nvvm_suld_1d_v2i8_zero:
3917     return NVPTXISD::Suld1DV2I8Zero;
3918   case Intrinsic::nvvm_suld_1d_v2i16_zero:
3919     return NVPTXISD::Suld1DV2I16Zero;
3920   case Intrinsic::nvvm_suld_1d_v2i32_zero:
3921     return NVPTXISD::Suld1DV2I32Zero;
3922   case Intrinsic::nvvm_suld_1d_v2i64_zero:
3923     return NVPTXISD::Suld1DV2I64Zero;
3924   case Intrinsic::nvvm_suld_1d_v4i8_zero:
3925     return NVPTXISD::Suld1DV4I8Zero;
3926   case Intrinsic::nvvm_suld_1d_v4i16_zero:
3927     return NVPTXISD::Suld1DV4I16Zero;
3928   case Intrinsic::nvvm_suld_1d_v4i32_zero:
3929     return NVPTXISD::Suld1DV4I32Zero;
3930   case Intrinsic::nvvm_suld_1d_array_i8_zero:
3931     return NVPTXISD::Suld1DArrayI8Zero;
3932   case Intrinsic::nvvm_suld_1d_array_i16_zero:
3933     return NVPTXISD::Suld1DArrayI16Zero;
3934   case Intrinsic::nvvm_suld_1d_array_i32_zero:
3935     return NVPTXISD::Suld1DArrayI32Zero;
3936   case Intrinsic::nvvm_suld_1d_array_i64_zero:
3937     return NVPTXISD::Suld1DArrayI64Zero;
3938   case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3939     return NVPTXISD::Suld1DArrayV2I8Zero;
3940   case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3941     return NVPTXISD::Suld1DArrayV2I16Zero;
3942   case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3943     return NVPTXISD::Suld1DArrayV2I32Zero;
3944   case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3945     return NVPTXISD::Suld1DArrayV2I64Zero;
3946   case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3947     return NVPTXISD::Suld1DArrayV4I8Zero;
3948   case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3949     return NVPTXISD::Suld1DArrayV4I16Zero;
3950   case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3951     return NVPTXISD::Suld1DArrayV4I32Zero;
3952   case Intrinsic::nvvm_suld_2d_i8_zero:
3953     return NVPTXISD::Suld2DI8Zero;
3954   case Intrinsic::nvvm_suld_2d_i16_zero:
3955     return NVPTXISD::Suld2DI16Zero;
3956   case Intrinsic::nvvm_suld_2d_i32_zero:
3957     return NVPTXISD::Suld2DI32Zero;
3958   case Intrinsic::nvvm_suld_2d_i64_zero:
3959     return NVPTXISD::Suld2DI64Zero;
3960   case Intrinsic::nvvm_suld_2d_v2i8_zero:
3961     return NVPTXISD::Suld2DV2I8Zero;
3962   case Intrinsic::nvvm_suld_2d_v2i16_zero:
3963     return NVPTXISD::Suld2DV2I16Zero;
3964   case Intrinsic::nvvm_suld_2d_v2i32_zero:
3965     return NVPTXISD::Suld2DV2I32Zero;
3966   case Intrinsic::nvvm_suld_2d_v2i64_zero:
3967     return NVPTXISD::Suld2DV2I64Zero;
3968   case Intrinsic::nvvm_suld_2d_v4i8_zero:
3969     return NVPTXISD::Suld2DV4I8Zero;
3970   case Intrinsic::nvvm_suld_2d_v4i16_zero:
3971     return NVPTXISD::Suld2DV4I16Zero;
3972   case Intrinsic::nvvm_suld_2d_v4i32_zero:
3973     return NVPTXISD::Suld2DV4I32Zero;
3974   case Intrinsic::nvvm_suld_2d_array_i8_zero:
3975     return NVPTXISD::Suld2DArrayI8Zero;
3976   case Intrinsic::nvvm_suld_2d_array_i16_zero:
3977     return NVPTXISD::Suld2DArrayI16Zero;
3978   case Intrinsic::nvvm_suld_2d_array_i32_zero:
3979     return NVPTXISD::Suld2DArrayI32Zero;
3980   case Intrinsic::nvvm_suld_2d_array_i64_zero:
3981     return NVPTXISD::Suld2DArrayI64Zero;
3982   case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3983     return NVPTXISD::Suld2DArrayV2I8Zero;
3984   case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3985     return NVPTXISD::Suld2DArrayV2I16Zero;
3986   case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3987     return NVPTXISD::Suld2DArrayV2I32Zero;
3988   case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3989     return NVPTXISD::Suld2DArrayV2I64Zero;
3990   case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3991     return NVPTXISD::Suld2DArrayV4I8Zero;
3992   case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3993     return NVPTXISD::Suld2DArrayV4I16Zero;
3994   case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3995     return NVPTXISD::Suld2DArrayV4I32Zero;
3996   case Intrinsic::nvvm_suld_3d_i8_zero:
3997     return NVPTXISD::Suld3DI8Zero;
3998   case Intrinsic::nvvm_suld_3d_i16_zero:
3999     return NVPTXISD::Suld3DI16Zero;
4000   case Intrinsic::nvvm_suld_3d_i32_zero:
4001     return NVPTXISD::Suld3DI32Zero;
4002   case Intrinsic::nvvm_suld_3d_i64_zero:
4003     return NVPTXISD::Suld3DI64Zero;
4004   case Intrinsic::nvvm_suld_3d_v2i8_zero:
4005     return NVPTXISD::Suld3DV2I8Zero;
4006   case Intrinsic::nvvm_suld_3d_v2i16_zero:
4007     return NVPTXISD::Suld3DV2I16Zero;
4008   case Intrinsic::nvvm_suld_3d_v2i32_zero:
4009     return NVPTXISD::Suld3DV2I32Zero;
4010   case Intrinsic::nvvm_suld_3d_v2i64_zero:
4011     return NVPTXISD::Suld3DV2I64Zero;
4012   case Intrinsic::nvvm_suld_3d_v4i8_zero:
4013     return NVPTXISD::Suld3DV4I8Zero;
4014   case Intrinsic::nvvm_suld_3d_v4i16_zero:
4015     return NVPTXISD::Suld3DV4I16Zero;
4016   case Intrinsic::nvvm_suld_3d_v4i32_zero:
4017     return NVPTXISD::Suld3DV4I32Zero;
4018   }
4019 }
4020 
4021 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
4022 // TgtMemIntrinsic because we need the information that is only available
4023 // in the "Value" type of the destination pointer. In particular, the
4024 // address space information.
4026 bool NVPTXTargetLowering::getTgtMemIntrinsic(
4027     IntrinsicInfo &Info, const CallInst &I,
4028     MachineFunction &MF, unsigned Intrinsic) const {
4029   switch (Intrinsic) {
4030   default:
4031     return false;
4032   case Intrinsic::nvvm_match_all_sync_i32p:
4033   case Intrinsic::nvvm_match_all_sync_i64p:
4034     Info.opc = ISD::INTRINSIC_W_CHAIN;
4035     // memVT is bogus. These intrinsics have the IntrInaccessibleMemOnly
4036     // attribute in order to model data exchange with other threads, but they
4037     // perform no real memory accesses.
4038     Info.memVT = MVT::i1;
4039 
4040     // Our result depends on both our own and other threads' arguments.
4041     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4042     return true;
4043   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
4044   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
4045   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
4046   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
4047   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
4048   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
4049   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
4050   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
4051   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
4052   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
4053   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
4054   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
4055   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
4056   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
4057   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
4058   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
4059   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
4060   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
4061   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
4062   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
4063   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
4064   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
4065   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
4066   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
4067     Info.opc = ISD::INTRINSIC_W_CHAIN;
4068     Info.memVT = MVT::v8f16;
4069     Info.ptrVal = I.getArgOperand(0);
4070     Info.offset = 0;
4071     Info.flags = MachineMemOperand::MOLoad;
4072     Info.align = Align(16);
4073     return true;
4074   }
4075   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
4076   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
4077   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
4078   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
4079   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
4080   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
4081   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
4082   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
4083   case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
4084   case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
4085   case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
4086   case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
4087   case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
4088   case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
4089   case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
4090   case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
4091   case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
4092   case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
4093   case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
4094   case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
4095   case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
4096   case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
4097   case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
4098   case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
4099     Info.opc = ISD::INTRINSIC_W_CHAIN;
4100     Info.memVT = MVT::v2i32;
4101     Info.ptrVal = I.getArgOperand(0);
4102     Info.offset = 0;
4103     Info.flags = MachineMemOperand::MOLoad;
4104     Info.align = Align(8);
4105     return true;
4106   }
4107 
4108   case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
4109   case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
4110   case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
4111   case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
4112   case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
4113   case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
4114   case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
4115   case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
4116   case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
4117   case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
4118   case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
4119   case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
4120   case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
4121   case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
4122   case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
4123   case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
4124 
4125   case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
4126   case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
4127   case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
4128   case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
4129   case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
4130   case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
4131   case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
4132   case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
4133   case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
4134   case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
4135   case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
4136   case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
4137   case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
4138   case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
4139   case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
4140   case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
4141   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
4142   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
4143     Info.opc = ISD::INTRINSIC_W_CHAIN;
4144     Info.memVT = MVT::v4i32;
4145     Info.ptrVal = I.getArgOperand(0);
4146     Info.offset = 0;
4147     Info.flags = MachineMemOperand::MOLoad;
4148     Info.align = Align(16);
4149     return true;
4150   }
4151 
4152   case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
4153   case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
4154   case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
4155   case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
4156   case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
4157   case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
4158   case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
4159   case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
4160 
4161   case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
4162   case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
4163   case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
4164   case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
4165   case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
4166   case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
4167   case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
4168   case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
4169   case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
4170   case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
4171   case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
4172   case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
4173   case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
4174   case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
4175   case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
4176   case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
4177   case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
4178   case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
4179   case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
4180   case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
4181   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
4182   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
4183     Info.opc = ISD::INTRINSIC_W_CHAIN;
4184     Info.memVT = MVT::i32;
4185     Info.ptrVal = I.getArgOperand(0);
4186     Info.offset = 0;
4187     Info.flags = MachineMemOperand::MOLoad;
4188     Info.align = Align(4);
4189     return true;
4190   }
4191 
4192   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
4193   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
4194   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
4195   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
4196   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
4197   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
4198   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
4199   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
4200   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
4201   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
4202   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
4203   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
4204     Info.opc = ISD::INTRINSIC_W_CHAIN;
4205     Info.memVT = MVT::v4f16;
4206     Info.ptrVal = I.getArgOperand(0);
4207     Info.offset = 0;
4208     Info.flags = MachineMemOperand::MOLoad;
4209     Info.align = Align(16);
4210     return true;
4211   }
4212 
4213   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
4214   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
4215   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
4216   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
4217   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
4218   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
4219   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
4220   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
4221   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
4222   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
4223   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
4224   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4225   case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4226   case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4227   case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4228   case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4229     Info.opc = ISD::INTRINSIC_W_CHAIN;
4230     Info.memVT = MVT::v8f32;
4231     Info.ptrVal = I.getArgOperand(0);
4232     Info.offset = 0;
4233     Info.flags = MachineMemOperand::MOLoad;
4234     Info.align = Align(16);
4235     return true;
4236   }
4237 
4238   case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4239   case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4240   case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4241   case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4242 
4243   case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4244   case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4245   case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4246   case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4247 
4248   case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4249   case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4250   case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4251   case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4252   case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4253   case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4254   case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4255   case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4256   case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4257   case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4258   case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4259   case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4260     Info.opc = ISD::INTRINSIC_W_CHAIN;
4261     Info.memVT = MVT::v8i32;
4262     Info.ptrVal = I.getArgOperand(0);
4263     Info.offset = 0;
4264     Info.flags = MachineMemOperand::MOLoad;
4265     Info.align = Align(16);
4266     return true;
4267   }
4268 
4269   case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4270   case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4271   case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4272   case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4273   case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4274   case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4275   case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4276   case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4277   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4278   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
4279     Info.opc = ISD::INTRINSIC_W_CHAIN;
4280     Info.memVT = MVT::v2i32;
4281     Info.ptrVal = I.getArgOperand(0);
4282     Info.offset = 0;
4283     Info.flags = MachineMemOperand::MOLoad;
4284     Info.align = Align(8);
4285     return true;
4286   }
4287 
4288   case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4289   case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4290   case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4291   case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4292 
4293   case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4294   case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4295   case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4296   case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4297     Info.opc = ISD::INTRINSIC_W_CHAIN;
4298     Info.memVT = MVT::f64;
4299     Info.ptrVal = I.getArgOperand(0);
4300     Info.offset = 0;
4301     Info.flags = MachineMemOperand::MOLoad;
4302     Info.align = Align(8);
4303     return true;
4304   }
4305 
4306   case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4307   case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4308   case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4309   case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4310     Info.opc = ISD::INTRINSIC_W_CHAIN;
4311     Info.memVT = MVT::v2f64;
4312     Info.ptrVal = I.getArgOperand(0);
4313     Info.offset = 0;
4314     Info.flags = MachineMemOperand::MOLoad;
4315     Info.align = Align(16);
4316     return true;
4317   }
4318 
4319   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4320   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4321   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4322   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4323   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4324   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4325   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4326   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4327   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4328   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4329   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4330   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4331     Info.opc = ISD::INTRINSIC_VOID;
4332     Info.memVT = MVT::v4f16;
4333     Info.ptrVal = I.getArgOperand(0);
4334     Info.offset = 0;
4335     Info.flags = MachineMemOperand::MOStore;
4336     Info.align = Align(16);
4337     return true;
4338   }
4339 
4340   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4341   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4342   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4343   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4344   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4345   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4346   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4347   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4348   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4349   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4350   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4351   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4352   case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4353   case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4354   case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4355   case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4356     Info.opc = ISD::INTRINSIC_VOID;
4357     Info.memVT = MVT::v8f32;
4358     Info.ptrVal = I.getArgOperand(0);
4359     Info.offset = 0;
4360     Info.flags = MachineMemOperand::MOStore;
4361     Info.align = Align(16);
4362     return true;
4363   }
4364 
4365   case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4366   case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4367   case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4368   case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4369   case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4370   case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4371   case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4372   case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4373   case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4374   case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4375   case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4376   case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4377     Info.opc = ISD::INTRINSIC_VOID;
4378     Info.memVT = MVT::v8i32;
4379     Info.ptrVal = I.getArgOperand(0);
4380     Info.offset = 0;
4381     Info.flags = MachineMemOperand::MOStore;
4382     Info.align = Align(16);
4383     return true;
4384   }
4385 
4386   case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4387   case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4388   case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4389   case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4390   case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4391   case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4392   case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4393   case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
4394     Info.opc = ISD::INTRINSIC_VOID;
4395     Info.memVT = MVT::v2i32;
4396     Info.ptrVal = I.getArgOperand(0);
4397     Info.offset = 0;
4398     Info.flags = MachineMemOperand::MOStore;
4399     Info.align = Align(8);
4400     return true;
4401   }
4402 
4403   case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4404   case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4405   case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4406   case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4407     Info.opc = ISD::INTRINSIC_VOID;
4408     Info.memVT = MVT::v2f64;
4409     Info.ptrVal = I.getArgOperand(0);
4410     Info.offset = 0;
4411     Info.flags = MachineMemOperand::MOStore;
4412     Info.align = Align(16);
4413     return true;
4414   }
4415 
4416   case Intrinsic::nvvm_atomic_load_inc_32:
4417   case Intrinsic::nvvm_atomic_load_dec_32:
4418 
4419   case Intrinsic::nvvm_atomic_add_gen_f_cta:
4420   case Intrinsic::nvvm_atomic_add_gen_f_sys:
4421   case Intrinsic::nvvm_atomic_add_gen_i_cta:
4422   case Intrinsic::nvvm_atomic_add_gen_i_sys:
4423   case Intrinsic::nvvm_atomic_and_gen_i_cta:
4424   case Intrinsic::nvvm_atomic_and_gen_i_sys:
4425   case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4426   case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4427   case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4428   case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4429   case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4430   case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4431   case Intrinsic::nvvm_atomic_max_gen_i_cta:
4432   case Intrinsic::nvvm_atomic_max_gen_i_sys:
4433   case Intrinsic::nvvm_atomic_min_gen_i_cta:
4434   case Intrinsic::nvvm_atomic_min_gen_i_sys:
4435   case Intrinsic::nvvm_atomic_or_gen_i_cta:
4436   case Intrinsic::nvvm_atomic_or_gen_i_sys:
4437   case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4438   case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4439   case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4440   case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
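    // Illustrative note (editorial addition): memVT below mirrors the
    // intrinsic's result type, so e.g. a 32-bit generic-space atomic add is
    // modeled as a 4-byte load+store through its pointer operand, with the
    // alignment left unspecified (reset below).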
4441     auto &DL = I.getModule()->getDataLayout();
4442     Info.opc = ISD::INTRINSIC_W_CHAIN;
4443     Info.memVT = getValueType(DL, I.getType());
4444     Info.ptrVal = I.getArgOperand(0);
4445     Info.offset = 0;
4446     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4447     Info.align.reset();
4448     return true;
4449   }
4450 
4451   case Intrinsic::nvvm_ldu_global_i:
4452   case Intrinsic::nvvm_ldu_global_f:
4453   case Intrinsic::nvvm_ldu_global_p: {
4454     auto &DL = I.getModule()->getDataLayout();
4455     Info.opc = ISD::INTRINSIC_W_CHAIN;
4456     if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
4457       Info.memVT = getPointerTy(DL);
4458     else
4459       Info.memVT = getValueType(DL, I.getType());
4462     Info.ptrVal = I.getArgOperand(0);
4463     Info.offset = 0;
4464     Info.flags = MachineMemOperand::MOLoad;
4465     Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4466 
4467     return true;
4468   }
4469   case Intrinsic::nvvm_ldg_global_i:
4470   case Intrinsic::nvvm_ldg_global_f:
4471   case Intrinsic::nvvm_ldg_global_p: {
4472     auto &DL = I.getModule()->getDataLayout();
4473 
4474     Info.opc = ISD::INTRINSIC_W_CHAIN;
4475     if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
4476       Info.memVT = getPointerTy(DL);
4477     else
4478       Info.memVT = getValueType(DL, I.getType());
4481     Info.ptrVal = I.getArgOperand(0);
4482     Info.offset = 0;
4483     Info.flags = MachineMemOperand::MOLoad;
4484     Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4485 
4486     return true;
4487   }
4488 
4489   case Intrinsic::nvvm_tex_1d_v4f32_s32:
4490   case Intrinsic::nvvm_tex_1d_v4f32_f32:
4491   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4492   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4493   case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4494   case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4495   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4496   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4497   case Intrinsic::nvvm_tex_2d_v4f32_s32:
4498   case Intrinsic::nvvm_tex_2d_v4f32_f32:
4499   case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4500   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4501   case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4502   case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4503   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4504   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4505   case Intrinsic::nvvm_tex_3d_v4f32_s32:
4506   case Intrinsic::nvvm_tex_3d_v4f32_f32:
4507   case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4508   case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4509   case Intrinsic::nvvm_tex_cube_v4f32_f32:
4510   case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4511   case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4512   case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4513   case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4514   case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4515   case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4516   case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4517   case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4518   case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4519   case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4520   case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4521   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4522   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4523   case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4524   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4525   case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4526   case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4527   case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4528   case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4529   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4530   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4531   case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4532   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4533   case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4534   case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4535   case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4536   case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4537   case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4538   case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4539   case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4540   case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4541   case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4542   case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4543   case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4544   case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4545     Info.opc = getOpcForTextureInstr(Intrinsic);
4546     Info.memVT = MVT::v4f32;
4547     Info.ptrVal = nullptr;
4548     Info.offset = 0;
4549     Info.flags = MachineMemOperand::MOLoad;
4550     Info.align = Align(16);
4551     return true;
4552 
4553   case Intrinsic::nvvm_tex_1d_v4s32_s32:
4554   case Intrinsic::nvvm_tex_1d_v4s32_f32:
4555   case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4556   case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4557   case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4558   case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4559   case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4560   case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4561   case Intrinsic::nvvm_tex_2d_v4s32_s32:
4562   case Intrinsic::nvvm_tex_2d_v4s32_f32:
4563   case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4564   case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4565   case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4566   case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4567   case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4568   case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4569   case Intrinsic::nvvm_tex_3d_v4s32_s32:
4570   case Intrinsic::nvvm_tex_3d_v4s32_f32:
4571   case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4572   case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4573   case Intrinsic::nvvm_tex_cube_v4s32_f32:
4574   case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4575   case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4576   case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4577   case Intrinsic::nvvm_tex_cube_v4u32_f32:
4578   case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4579   case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4580   case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4581   case Intrinsic::nvvm_tex_1d_v4u32_s32:
4582   case Intrinsic::nvvm_tex_1d_v4u32_f32:
4583   case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4584   case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4585   case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4586   case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4587   case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4588   case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4589   case Intrinsic::nvvm_tex_2d_v4u32_s32:
4590   case Intrinsic::nvvm_tex_2d_v4u32_f32:
4591   case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4592   case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4593   case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4594   case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4595   case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4596   case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4597   case Intrinsic::nvvm_tex_3d_v4u32_s32:
4598   case Intrinsic::nvvm_tex_3d_v4u32_f32:
4599   case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4600   case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4601   case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4602   case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4603   case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4604   case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4605   case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4606   case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4607   case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4608   case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4609   case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4610   case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4611   case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4612   case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4613   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4614   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4615   case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4616   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4617   case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4618   case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4619   case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4620   case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4621   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4622   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4623   case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4624   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4625   case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4626   case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4627   case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4628   case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4629   case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4630   case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4631   case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4632   case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4633   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4634   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4635   case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4636   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4637   case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4638   case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4639   case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4640   case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4641   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4642   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4643   case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4644   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4645   case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4646   case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4647   case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4648   case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4649   case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4650   case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4651   case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4652   case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4653   case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4654   case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4655   case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4656   case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4657   case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4658   case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4659   case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4660   case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4661   case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4662   case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4663   case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4664   case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4665     Info.opc = getOpcForTextureInstr(Intrinsic);
4666     Info.memVT = MVT::v4i32;
4667     Info.ptrVal = nullptr;
4668     Info.offset = 0;
4669     Info.flags = MachineMemOperand::MOLoad;
4670     Info.align = Align(16);
4671     return true;
4672 
4673   case Intrinsic::nvvm_suld_1d_i8_clamp:
4674   case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4675   case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4676   case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4677   case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4678   case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4679   case Intrinsic::nvvm_suld_2d_i8_clamp:
4680   case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4681   case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4682   case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4683   case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4684   case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4685   case Intrinsic::nvvm_suld_3d_i8_clamp:
4686   case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4687   case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4688   case Intrinsic::nvvm_suld_1d_i8_trap:
4689   case Intrinsic::nvvm_suld_1d_v2i8_trap:
4690   case Intrinsic::nvvm_suld_1d_v4i8_trap:
4691   case Intrinsic::nvvm_suld_1d_array_i8_trap:
4692   case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4693   case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4694   case Intrinsic::nvvm_suld_2d_i8_trap:
4695   case Intrinsic::nvvm_suld_2d_v2i8_trap:
4696   case Intrinsic::nvvm_suld_2d_v4i8_trap:
4697   case Intrinsic::nvvm_suld_2d_array_i8_trap:
4698   case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4699   case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4700   case Intrinsic::nvvm_suld_3d_i8_trap:
4701   case Intrinsic::nvvm_suld_3d_v2i8_trap:
4702   case Intrinsic::nvvm_suld_3d_v4i8_trap:
4703   case Intrinsic::nvvm_suld_1d_i8_zero:
4704   case Intrinsic::nvvm_suld_1d_v2i8_zero:
4705   case Intrinsic::nvvm_suld_1d_v4i8_zero:
4706   case Intrinsic::nvvm_suld_1d_array_i8_zero:
4707   case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4708   case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4709   case Intrinsic::nvvm_suld_2d_i8_zero:
4710   case Intrinsic::nvvm_suld_2d_v2i8_zero:
4711   case Intrinsic::nvvm_suld_2d_v4i8_zero:
4712   case Intrinsic::nvvm_suld_2d_array_i8_zero:
4713   case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4714   case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4715   case Intrinsic::nvvm_suld_3d_i8_zero:
4716   case Intrinsic::nvvm_suld_3d_v2i8_zero:
4717   case Intrinsic::nvvm_suld_3d_v4i8_zero:
4718     Info.opc = getOpcForSurfaceInstr(Intrinsic);
4719     Info.memVT = MVT::i8;
4720     Info.ptrVal = nullptr;
4721     Info.offset = 0;
4722     Info.flags = MachineMemOperand::MOLoad;
4723     Info.align = Align(16);
4724     return true;
4725 
4726   case Intrinsic::nvvm_suld_1d_i16_clamp:
4727   case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4728   case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4729   case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4730   case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4731   case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4732   case Intrinsic::nvvm_suld_2d_i16_clamp:
4733   case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4734   case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4735   case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4736   case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4737   case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4738   case Intrinsic::nvvm_suld_3d_i16_clamp:
4739   case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4740   case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4741   case Intrinsic::nvvm_suld_1d_i16_trap:
4742   case Intrinsic::nvvm_suld_1d_v2i16_trap:
4743   case Intrinsic::nvvm_suld_1d_v4i16_trap:
4744   case Intrinsic::nvvm_suld_1d_array_i16_trap:
4745   case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4746   case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4747   case Intrinsic::nvvm_suld_2d_i16_trap:
4748   case Intrinsic::nvvm_suld_2d_v2i16_trap:
4749   case Intrinsic::nvvm_suld_2d_v4i16_trap:
4750   case Intrinsic::nvvm_suld_2d_array_i16_trap:
4751   case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4752   case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4753   case Intrinsic::nvvm_suld_3d_i16_trap:
4754   case Intrinsic::nvvm_suld_3d_v2i16_trap:
4755   case Intrinsic::nvvm_suld_3d_v4i16_trap:
4756   case Intrinsic::nvvm_suld_1d_i16_zero:
4757   case Intrinsic::nvvm_suld_1d_v2i16_zero:
4758   case Intrinsic::nvvm_suld_1d_v4i16_zero:
4759   case Intrinsic::nvvm_suld_1d_array_i16_zero:
4760   case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4761   case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4762   case Intrinsic::nvvm_suld_2d_i16_zero:
4763   case Intrinsic::nvvm_suld_2d_v2i16_zero:
4764   case Intrinsic::nvvm_suld_2d_v4i16_zero:
4765   case Intrinsic::nvvm_suld_2d_array_i16_zero:
4766   case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4767   case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4768   case Intrinsic::nvvm_suld_3d_i16_zero:
4769   case Intrinsic::nvvm_suld_3d_v2i16_zero:
4770   case Intrinsic::nvvm_suld_3d_v4i16_zero:
4771     Info.opc = getOpcForSurfaceInstr(Intrinsic);
4772     Info.memVT = MVT::i16;
4773     Info.ptrVal = nullptr;
4774     Info.offset = 0;
4775     Info.flags = MachineMemOperand::MOLoad;
4776     Info.align = Align(16);
4777     return true;
4778 
4779   case Intrinsic::nvvm_suld_1d_i32_clamp:
4780   case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4781   case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4782   case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4783   case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4784   case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4785   case Intrinsic::nvvm_suld_2d_i32_clamp:
4786   case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4787   case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4788   case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4789   case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4790   case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4791   case Intrinsic::nvvm_suld_3d_i32_clamp:
4792   case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4793   case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4794   case Intrinsic::nvvm_suld_1d_i32_trap:
4795   case Intrinsic::nvvm_suld_1d_v2i32_trap:
4796   case Intrinsic::nvvm_suld_1d_v4i32_trap:
4797   case Intrinsic::nvvm_suld_1d_array_i32_trap:
4798   case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4799   case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4800   case Intrinsic::nvvm_suld_2d_i32_trap:
4801   case Intrinsic::nvvm_suld_2d_v2i32_trap:
4802   case Intrinsic::nvvm_suld_2d_v4i32_trap:
4803   case Intrinsic::nvvm_suld_2d_array_i32_trap:
4804   case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4805   case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4806   case Intrinsic::nvvm_suld_3d_i32_trap:
4807   case Intrinsic::nvvm_suld_3d_v2i32_trap:
4808   case Intrinsic::nvvm_suld_3d_v4i32_trap:
4809   case Intrinsic::nvvm_suld_1d_i32_zero:
4810   case Intrinsic::nvvm_suld_1d_v2i32_zero:
4811   case Intrinsic::nvvm_suld_1d_v4i32_zero:
4812   case Intrinsic::nvvm_suld_1d_array_i32_zero:
4813   case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4814   case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4815   case Intrinsic::nvvm_suld_2d_i32_zero:
4816   case Intrinsic::nvvm_suld_2d_v2i32_zero:
4817   case Intrinsic::nvvm_suld_2d_v4i32_zero:
4818   case Intrinsic::nvvm_suld_2d_array_i32_zero:
4819   case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4820   case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4821   case Intrinsic::nvvm_suld_3d_i32_zero:
4822   case Intrinsic::nvvm_suld_3d_v2i32_zero:
4823   case Intrinsic::nvvm_suld_3d_v4i32_zero:
4824     Info.opc = getOpcForSurfaceInstr(Intrinsic);
4825     Info.memVT = MVT::i32;
4826     Info.ptrVal = nullptr;
4827     Info.offset = 0;
4828     Info.flags = MachineMemOperand::MOLoad;
4829     Info.align = Align(16);
4830     return true;
4831 
4832   case Intrinsic::nvvm_suld_1d_i64_clamp:
4833   case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4834   case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4835   case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4836   case Intrinsic::nvvm_suld_2d_i64_clamp:
4837   case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4838   case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4839   case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4840   case Intrinsic::nvvm_suld_3d_i64_clamp:
4841   case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4842   case Intrinsic::nvvm_suld_1d_i64_trap:
4843   case Intrinsic::nvvm_suld_1d_v2i64_trap:
4844   case Intrinsic::nvvm_suld_1d_array_i64_trap:
4845   case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4846   case Intrinsic::nvvm_suld_2d_i64_trap:
4847   case Intrinsic::nvvm_suld_2d_v2i64_trap:
4848   case Intrinsic::nvvm_suld_2d_array_i64_trap:
4849   case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4850   case Intrinsic::nvvm_suld_3d_i64_trap:
4851   case Intrinsic::nvvm_suld_3d_v2i64_trap:
4852   case Intrinsic::nvvm_suld_1d_i64_zero:
4853   case Intrinsic::nvvm_suld_1d_v2i64_zero:
4854   case Intrinsic::nvvm_suld_1d_array_i64_zero:
4855   case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4856   case Intrinsic::nvvm_suld_2d_i64_zero:
4857   case Intrinsic::nvvm_suld_2d_v2i64_zero:
4858   case Intrinsic::nvvm_suld_2d_array_i64_zero:
4859   case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4860   case Intrinsic::nvvm_suld_3d_i64_zero:
4861   case Intrinsic::nvvm_suld_3d_v2i64_zero:
4862     Info.opc = getOpcForSurfaceInstr(Intrinsic);
4863     Info.memVT = MVT::i64;
4864     Info.ptrVal = nullptr;
4865     Info.offset = 0;
4866     Info.flags = MachineMemOperand::MOLoad;
4867     Info.align = Align(16);
4868     return true;
4869   }
4870   return false;
4871 }
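// Illustrative sketch (editorial addition, not upstream code): the
// IntrinsicInfo filled in above is consumed by SelectionDAG when it builds
// the intrinsic node and is turned into a MachineMemOperand. For one of the
// 16-byte WMMA fragment loads, for example, the fields read roughly:
//   Info.opc    = ISD::INTRINSIC_W_CHAIN;   // chained node with results
//   Info.memVT  = MVT::v8f16;               // 16 bytes accessed
//   Info.ptrVal = I.getArgOperand(0);       // base pointer of the access
//   Info.align  = Align(16);
// which yields a load-flagged memory operand with that size and alignment.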
4872 
4873 /// getFunctionParamOptimizedAlign - Since function arguments are passed via
4874 /// .param space, we may want to increase their alignment in a way that
4875 /// ensures that we can effectively vectorize their loads & stores. We can
4876 /// increase alignment only if the function has internal or private linkage,
4877 /// as callers of functions with other linkage types may already rely on the
4878 /// default alignment. To allow using 128-bit vectorized loads/stores, this
4879 /// function ensures that the returned alignment is 16 or greater.
4880 Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
4881     const Function *F, Type *ArgTy, const DataLayout &DL) const {
4882   const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value();
4883 
4884   // If a function has linkage other than internal or private, we must use
4885   // the default ABI alignment, as external users rely on it. The same holds
4886   // for a function that may be called through a function pointer.
4887   if (!F || !F->hasLocalLinkage() ||
4888       F->hasAddressTaken(/*Users=*/nullptr,
4889                          /*IgnoreCallbackUses=*/false,
4890                          /*IgnoreAssumeLikeCalls=*/true,
4891                          /*IgnoreLLVMUsed=*/true))
4892     return Align(ABITypeAlign);
4893 
4894   assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
4895   return Align(std::max(uint64_t(16), ABITypeAlign));
4896 }
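// Illustrative example (not from the original source): for an internal
// function whose address is never taken, an i8 argument with ABI alignment 1
// is reported as Align(16) here so its .param accesses can be vectorized,
// while the same argument of an externally visible function keeps the plain
// ABI alignment because callers may already depend on it.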
4897 
4898 /// Helper for computing alignment of a device function byval parameter.
4899 Align NVPTXTargetLowering::getFunctionByValParamAlign(
4900     const Function *F, Type *ArgTy, Align InitialAlign,
4901     const DataLayout &DL) const {
4902   Align ArgAlign = InitialAlign;
4903   // Try to increase alignment to enhance vectorization options.
4904   if (F)
4905     ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
4906 
4907   // Old ptxas versions have a bug. When PTX code takes the address of
4908   // a byval parameter with alignment < 4, ptxas generates code to
4909   // spill the argument into memory. Alas, on sm_50+ ptxas generates
4910   // SASS code that fails with a misaligned access. To work around
4911   // the problem, make sure that we align byval parameters to at
4912   // least 4. This bug seems to be fixed starting with
4913   // ptxas > 9.0.
4914   // TODO: remove this after verifying that the bug is not reproduced
4915   // on non-deprecated ptxas versions.
4916   if (ForceMinByValParamAlign)
4917     ArgAlign = std::max(ArgAlign, Align(4));
4918 
4919   return ArgAlign;
4920 }
4921 
4922 // Helper for getting a function parameter name. The name is composed from
4923 // the function name and the parameter's index. A negative index denotes the
4924 // special parameter (an unsized array) used for passing variable arguments.
4925 std::string NVPTXTargetLowering::getParamName(const Function *F,
4926                                               int Idx) const {
4927   std::string ParamName;
4928   raw_string_ostream ParamStr(ParamName);
4929 
4930   ParamStr << getTargetMachine().getSymbol(F)->getName();
4931   if (Idx < 0)
4932     ParamStr << "_vararg";
4933   else
4934     ParamStr << "_param_" << Idx;
4935 
4936   return ParamName;
4937 }
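// For example (illustrative): for a function whose symbol is "foo", an index
// of 2 yields "foo_param_2", while a negative index yields "foo_vararg".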
4938 
4939 /// isLegalAddressingMode - Return true if the addressing mode represented
4940 /// by AM is legal for this target, for a load/store of the specified type.
4941 /// Used to guide target specific optimizations, like loop strength reduction
4942 /// (LoopStrengthReduce.cpp) and memory optimization for address mode
4943 /// (CodeGenPrepare.cpp)
4944 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
4945                                                 const AddrMode &AM, Type *Ty,
4946                                                 unsigned AS, Instruction *I) const {
4947   // AddrMode - This represents an addressing mode of:
4948   //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
4949   //
4950   // The legal address modes are
4951   // - [avar]
4952   // - [areg]
4953   // - [areg+immoff]
4954   // - [immAddr]
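  //
  // Roughly, in PTX terms (illustrative examples, not emitted here):
  //   [avar]        -> ld.global.u32 %r1, [gvar];
  //   [areg]        -> ld.global.u32 %r1, [%rd1];
  //   [areg+immoff] -> ld.global.u32 %r1, [%rd1+8];
  //   [immAddr]     -> ld.global.u32 %r1, [42];
  // Register+register or scaled-register forms are rejected below and must be
  // materialized with explicit address arithmetic.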
4955 
4956   if (AM.BaseGV) {
4957     return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
4958   }
4959 
4960   switch (AM.Scale) {
4961   case 0: // "r", "r+i" or "i" is allowed
4962     break;
4963   case 1:
4964     if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
4965       return false;
4966     // Otherwise we have r+i.
4967     break;
4968   default:
4969     // No scale > 1 is allowed
4970     return false;
4971   }
4972   return true;
4973 }
4974 
4975 //===----------------------------------------------------------------------===//
4976 //                         NVPTX Inline Assembly Support
4977 //===----------------------------------------------------------------------===//
4978 
4979 /// getConstraintType - Given a constraint letter, return the type of
4980 /// constraint it is for this target.
4981 NVPTXTargetLowering::ConstraintType
4982 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
4983   if (Constraint.size() == 1) {
4984     switch (Constraint[0]) {
4985     default:
4986       break;
4987     case 'b':
4988     case 'r':
4989     case 'h':
4990     case 'c':
4991     case 'l':
4992     case 'f':
4993     case 'd':
4994     case '0':
4995     case 'N':
4996       return C_RegisterClass;
4997     }
4998   }
4999   return TargetLowering::getConstraintType(Constraint);
5000 }
5001 
5002 std::pair<unsigned, const TargetRegisterClass *>
5003 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
5004                                                   StringRef Constraint,
5005                                                   MVT VT) const {
5006   if (Constraint.size() == 1) {
5007     switch (Constraint[0]) {
5008     case 'b':
5009       return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
5010     case 'c':
5011       return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5012     case 'h':
5013       return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5014     case 'r':
5015       return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
5016     case 'l':
5017     case 'N':
5018       return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
5019     case 'f':
5020       return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
5021     case 'd':
5022       return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
5023     }
5024   }
5025   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5026 }
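// Illustrative use from CUDA inline asm (an assumption, not from this file):
//
//   unsigned x;
//   asm("mov.u32 %0, %%tid.x;" : "=r"(x));
//
// binds "=r" to a 32-bit integer register (Int32Regs); "l" selects a 64-bit
// register, "f"/"d" select f32/f64 registers, and "h"/"c" select 16-bit
// registers, matching the table above.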
5027 
5028 //===----------------------------------------------------------------------===//
5029 //                         NVPTX DAG Combining
5030 //===----------------------------------------------------------------------===//
5031 
5032 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
5033                                    CodeGenOptLevel OptLevel) const {
5034   // Always honor command-line argument
5035   if (FMAContractLevelOpt.getNumOccurrences() > 0)
5036     return FMAContractLevelOpt > 0;
5037 
5038   // Do not contract if we're not optimizing the code.
5039   if (OptLevel == CodeGenOptLevel::None)
5040     return false;
5041 
5042   // Honor TargetOptions flags that explicitly say fusion is okay.
5043   if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
5044     return true;
5045 
5046   return allowUnsafeFPMath(MF);
5047 }
5048 
5049 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
5050   // Honor TargetOptions flags that explicitly say unsafe math is okay.
5051   if (MF.getTarget().Options.UnsafeFPMath)
5052     return true;
5053 
5054   // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
5055   const Function &F = MF.getFunction();
5056   return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
5057 }
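// Illustrative precedence of the checks above: an explicit -nvptx-fma-level
// on the command line always wins; otherwise FMA contraction is off at -O0,
// on when TargetOptions allows fast FP op fusion, and otherwise follows the
// unsafe-fp-math setting (TargetOptions::UnsafeFPMath or the function's
// "unsafe-fp-math" attribute).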
5058 
5059 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5060 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
5061 /// called with the default operands, and if that fails, with commuted
5062 /// operands.
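///
/// Illustrative sketch of the folds attempted below:
///   (add  (mul  a, b), c) -> (NVPTXISD::IMAD a, b, c)  for i32
///   (fadd (fmul a, b), c) -> (ISD::FMA a, b, c)        for f32/f64
/// subject to the use-count and register-pressure heuristics in the body.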
5063 static SDValue PerformADDCombineWithOperands(
5064     SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI,
5065     const NVPTXSubtarget &Subtarget, CodeGenOptLevel OptLevel) {
5066   SelectionDAG &DAG = DCI.DAG;
5067   // Only handle the scalar case; vector types are skipped.
5068   EVT VT = N0.getValueType();
5069   if (VT.isVector())
5070     return SDValue();
5071 
5072   // fold (add (mul a, b), c) -> (mad a, b, c)
5073   //
5074   if (N0.getOpcode() == ISD::MUL) {
5075     assert(VT.isInteger());
5076     // For integer:
5077     // Since integer multiply-add costs the same as integer multiply
5078     // but is more costly than integer add, do the fusion only when
5079     // the mul is only used in the add.
5080     if (OptLevel == CodeGenOptLevel::None || VT != MVT::i32 ||
5081         !N0.getNode()->hasOneUse())
5082       return SDValue();
5083 
5084     // Do the folding
5085     return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
5086                        N0.getOperand(0), N0.getOperand(1), N1);
5087   }
5088   else if (N0.getOpcode() == ISD::FMUL) {
5089     if (VT == MVT::f32 || VT == MVT::f64) {
5090       const auto *TLI = static_cast<const NVPTXTargetLowering *>(
5091           &DAG.getTargetLoweringInfo());
5092       if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
5093         return SDValue();
5094 
5095       // For floating point:
5096       // Do the fusion only when the mul has fewer than 5 uses and all
5097       // of them are adds.
5098       // The heuristic is that if a use is not an add, then that use
5099       // cannot be fused into an fma, so the mul is still needed anyway.
5100       // If there are more than 4 uses, even if they are all adds, fusing
5101       // them will increase register pressure.
5102       //
5103       int numUses = 0;
5104       int nonAddCount = 0;
5105       for (const SDNode *User : N0.getNode()->uses()) {
5106         numUses++;
5107         if (User->getOpcode() != ISD::FADD)
5108           ++nonAddCount;
5109       }
5110       if (numUses >= 5)
5111         return SDValue();
5112       if (nonAddCount) {
5113         int orderNo = N->getIROrder();
5114         int orderNo2 = N0.getNode()->getIROrder();
5115         // A simple heuristic for estimating potential register pressure:
5116         // the IR-order difference is used as a proxy for the distance
5117         // between def and use, and a longer distance is more likely to
5118         // cause register pressure.
5119         if (orderNo - orderNo2 < 500)
5120           return SDValue();
5121 
5122         // Now check whether at least one of the FMUL's operands is live
5123         // beyond node N; if so, the FMA will not increase register pressure.
5124         bool opIsLive = false;
5125         const SDNode *left = N0.getOperand(0).getNode();
5126         const SDNode *right = N0.getOperand(1).getNode();
5127 
5128         if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5129           opIsLive = true;
5130 
5131         if (!opIsLive)
5132           for (const SDNode *User : left->uses()) {
5133             int orderNo3 = User->getIROrder();
5134             if (orderNo3 > orderNo) {
5135               opIsLive = true;
5136               break;
5137             }
5138           }
5139 
5140         if (!opIsLive)
5141           for (const SDNode *User : right->uses()) {
5142             int orderNo3 = User->getIROrder();
5143             if (orderNo3 > orderNo) {
5144               opIsLive = true;
5145               break;
5146             }
5147           }
5148 
5149         if (!opIsLive)
5150           return SDValue();
5151       }
5152 
5153       return DAG.getNode(ISD::FMA, SDLoc(N), VT,
5154                          N0.getOperand(0), N0.getOperand(1), N1);
5155     }
5156   }
5157 
5158   return SDValue();
5159 }
5160 
5161 static SDValue PerformStoreRetvalCombine(SDNode *N) {
5162   // Operands from the 2nd to the last one are the values to be stored
5163   for (std::size_t I = 2, OpsCount = N->ops().size(); I != OpsCount; ++I)
5164     if (!N->getOperand(I).isUndef())
5165       return SDValue();
5166 
5167   // Operand 0 is the previous value in the chain. Cannot return EntryToken
5168   // as the previous value will become unused and eliminated later.
5169   return N->getOperand(0);
5170 }
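// Illustrative sketch: a node such as (StoreRetval Chain, 0, undef) stores
// nothing useful, so it is folded away by returning its incoming chain and
// letting the now-dead node be removed later.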
5171 
5172 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5173 ///
5174 static SDValue PerformADDCombine(SDNode *N,
5175                                  TargetLowering::DAGCombinerInfo &DCI,
5176                                  const NVPTXSubtarget &Subtarget,
5177                                  CodeGenOptLevel OptLevel) {
5178   SDValue N0 = N->getOperand(0);
5179   SDValue N1 = N->getOperand(1);
5180 
5181   // First try with the default operand order.
5182   if (SDValue Result =
5183           PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
5184     return Result;
5185 
5186   // If that didn't work, try again with the operands commuted.
5187   return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
5188 }
5189 
5190 static SDValue PerformANDCombine(SDNode *N,
5191                                  TargetLowering::DAGCombinerInfo &DCI) {
5192   // The type legalizer turns a vector load of i8 values into a zextload to i16
5193   // registers, optionally ANY_EXTENDs it (if the target type is integer),
5194   // and ANDs off the high 8 bits. Since we turn this load into a
5195   // target-specific DAG node, the DAG combiner fails to eliminate these AND
5196   // nodes. Do that here.
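  //
  // E.g. (illustrative):
  //   (and (any_extend (NVPTXISD::LoadV2 ..., zextload v2i8)), 255)
  // becomes
  //   (zero_extend (NVPTXISD::LoadV2 ...))
  // because the load already zero-fills the upper bits.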
5197   SDValue Val = N->getOperand(0);
5198   SDValue Mask = N->getOperand(1);
5199 
5200   if (isa<ConstantSDNode>(Val)) {
5201     std::swap(Val, Mask);
5202   }
5203 
5204   SDValue AExt;
5205 
5206   // Convert BFE -> truncate i16 -> and 255
5207   // to just BFE -> truncate i16, as the value already has all the bits in
5208   // the right places.
5209   if (Val.getOpcode() == ISD::TRUNCATE) {
5210     SDValue BFE = Val.getOperand(0);
5211     if (BFE.getOpcode() != NVPTXISD::BFE)
5212       return SDValue();
5213 
5214     ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
5215     if (!BFEBits)
5216       return SDValue();
5217     uint64_t BFEBitsVal = BFEBits->getZExtValue();
5218 
5219     ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
5220     if (!MaskCnst) {
5221       // Not an AND with a constant
5222       return SDValue();
5223     }
5224     uint64_t MaskVal = MaskCnst->getZExtValue();
5225 
5226     if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
5227       return SDValue();
5228     // If we get here, the AND is unnecessary.  Just replace it with the trunc
5229     DCI.CombineTo(N, Val, false);
5230   }
5231   // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
5232   if (Val.getOpcode() == ISD::ANY_EXTEND) {
5233     AExt = Val;
5234     Val = Val->getOperand(0);
5235   }
5236 
5237   if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
5238     Val = Val->getOperand(0);
5239   }
5240 
5241   if (Val->getOpcode() == NVPTXISD::LoadV2 ||
5242       Val->getOpcode() == NVPTXISD::LoadV4) {
5243     ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
5244     if (!MaskCnst) {
5245       // Not an AND with a constant
5246       return SDValue();
5247     }
5248 
5249     uint64_t MaskVal = MaskCnst->getZExtValue();
5250     if (MaskVal != 0xff) {
5251       // Not an AND that chops off top 8 bits
5252       return SDValue();
5253     }
5254 
5255     MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
5256     if (!Mem) {
5257       // Not a MemSDNode?!?
5258       return SDValue();
5259     }
5260 
5261     EVT MemVT = Mem->getMemoryVT();
5262     if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
5263       // We only handle the i8 case
5264       return SDValue();
5265     }
5266 
5267     unsigned ExtType =
5268       cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
5269         getZExtValue();
5270     if (ExtType == ISD::SEXTLOAD) {
5271       // If for some reason the load is a sextload, the and is needed to zero
5272       // out the high 8 bits
5273       return SDValue();
5274     }
5275 
5276     bool AddTo = false;
5277     if (AExt.getNode() != nullptr) {
5278       // Re-insert the ext as a zext.
5279       Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
5280                             AExt.getValueType(), Val);
5281       AddTo = true;
5282     }
5283 
5284     // If we get here, the AND is unnecessary.  Just replace it with the load
5285     DCI.CombineTo(N, Val, AddTo);
5286   }
5287 
5288   return SDValue();
5289 }
5290 
5291 static SDValue PerformREMCombine(SDNode *N,
5292                                  TargetLowering::DAGCombinerInfo &DCI,
5293                                  CodeGenOptLevel OptLevel) {
5294   assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
5295 
5296   // Don't do anything at less than -O2.
5297   if (OptLevel < CodeGenOptLevel::Default)
5298     return SDValue();
5299 
5300   SelectionDAG &DAG = DCI.DAG;
5301   SDLoc DL(N);
5302   EVT VT = N->getValueType(0);
5303   bool IsSigned = N->getOpcode() == ISD::SREM;
5304   unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
5305 
5306   const SDValue &Num = N->getOperand(0);
5307   const SDValue &Den = N->getOperand(1);
5308 
5309   for (const SDNode *U : Num->uses()) {
5310     if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
5311         U->getOperand(1) == Den) {
5312       // Num % Den -> Num - (Num / Den) * Den
5313       return DAG.getNode(ISD::SUB, DL, VT, Num,
5314                          DAG.getNode(ISD::MUL, DL, VT,
5315                                      DAG.getNode(DivOpc, DL, VT, Num, Den),
5316                                      Den));
5317     }
5318   }
5319   return SDValue();
5320 }
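// Illustrative sketch: if the function computes both x / y and x % y with the
// same operands, the remainder is rewritten as x - (x / y) * y so only one
// division is emitted; the extra mul and sub are much cheaper than a second
// divide.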
5321 
5322 enum OperandSignedness {
5323   Signed = 0,
5324   Unsigned,
5325   Unknown
5326 };
5327 
5328 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5329 /// that can be demoted to \p OptSize bits without loss of information. The
5330 /// signedness of the operand, if determinable, is placed in \p S.
5331 static bool IsMulWideOperandDemotable(SDValue Op,
5332                                       unsigned OptSize,
5333                                       OperandSignedness &S) {
5334   S = Unknown;
5335 
5336   if (Op.getOpcode() == ISD::SIGN_EXTEND ||
5337       Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
5338     EVT OrigVT = Op.getOperand(0).getValueType();
5339     if (OrigVT.getFixedSizeInBits() <= OptSize) {
5340       S = Signed;
5341       return true;
5342     }
5343   } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
5344     EVT OrigVT = Op.getOperand(0).getValueType();
5345     if (OrigVT.getFixedSizeInBits() <= OptSize) {
5346       S = Unsigned;
5347       return true;
5348     }
5349   }
5350 
5351   return false;
5352 }
5353 
5354 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
5355 /// be demoted to \p OptSize bits without loss of information. If the operands
5356 /// contain a constant, it should appear as the RHS operand. The signedness of
5357 /// the operands is placed in \p IsSigned.
5358 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
5359                                         unsigned OptSize,
5360                                         bool &IsSigned) {
5361   OperandSignedness LHSSign;
5362 
5363   // The LHS operand must be a demotable op
5364   if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
5365     return false;
5366 
5367   // We should have been able to determine the signedness from the LHS
5368   if (LHSSign == Unknown)
5369     return false;
5370 
5371   IsSigned = (LHSSign == Signed);
5372 
5373   // The RHS can be a demotable op or a constant
5374   if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
5375     const APInt &Val = CI->getAPIntValue();
5376     if (LHSSign == Unsigned) {
5377       return Val.isIntN(OptSize);
5378     } else {
5379       return Val.isSignedIntN(OptSize);
5380     }
5381   } else {
5382     OperandSignedness RHSSign;
5383     if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
5384       return false;
5385 
5386     return LHSSign == RHSSign;
5387   }
5388 }
5389 
5390 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
5391 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
5392 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
5393 /// amount.
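///
/// Illustrative sketch of the patterns handled below:
///   (mul i32 (sext i16 %a), (sext i16 %b)) -> mul.wide.s16 %a, %b
///   (shl i32 (zext i16 %a), 4)             -> mul.wide.u16 %a, 16
/// i64 multiplies are demoted to 32-bit operands in the same way.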
5394 static SDValue TryMULWIDECombine(SDNode *N,
5395                                  TargetLowering::DAGCombinerInfo &DCI) {
5396   EVT MulType = N->getValueType(0);
5397   if (MulType != MVT::i32 && MulType != MVT::i64) {
5398     return SDValue();
5399   }
5400 
5401   SDLoc DL(N);
5402   unsigned OptSize = MulType.getSizeInBits() >> 1;
5403   SDValue LHS = N->getOperand(0);
5404   SDValue RHS = N->getOperand(1);
5405 
5406   // Canonicalize the multiply so the constant (if any) is on the right
5407   if (N->getOpcode() == ISD::MUL) {
5408     if (isa<ConstantSDNode>(LHS)) {
5409       std::swap(LHS, RHS);
5410     }
5411   }
5412 
5413   // If we have a SHL, determine the actual multiply amount
5414   if (N->getOpcode() == ISD::SHL) {
5415     ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
5416     if (!ShlRHS) {
5417       return SDValue();
5418     }
5419 
5420     APInt ShiftAmt = ShlRHS->getAPIntValue();
5421     unsigned BitWidth = MulType.getSizeInBits();
5422     if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
5423       APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
5424       RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
5425     } else {
5426       return SDValue();
5427     }
5428   }
5429 
5430   bool Signed;
5431   // Verify that our operands are demotable
5432   if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
5433     return SDValue();
5434   }
5435 
5436   EVT DemotedVT;
5437   if (MulType == MVT::i32) {
5438     DemotedVT = MVT::i16;
5439   } else {
5440     DemotedVT = MVT::i32;
5441   }
5442 
5443   // Truncate the operands to the correct size. Note that these are just for
5444   // type consistency and will (likely) be eliminated in later phases.
5445   SDValue TruncLHS =
5446     DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
5447   SDValue TruncRHS =
5448     DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
5449 
5450   unsigned Opc;
5451   if (Signed) {
5452     Opc = NVPTXISD::MUL_WIDE_SIGNED;
5453   } else {
5454     Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
5455   }
5456 
5457   return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
5458 }
5459 
5460 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
5461 static SDValue PerformMULCombine(SDNode *N,
5462                                  TargetLowering::DAGCombinerInfo &DCI,
5463                                  CodeGenOptLevel OptLevel) {
5464   if (OptLevel > CodeGenOptLevel::None) {
5465     // Try mul.wide combining at OptLevel > 0
5466     if (SDValue Ret = TryMULWIDECombine(N, DCI))
5467       return Ret;
5468   }
5469 
5470   return SDValue();
5471 }
5472 
5473 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
5474 static SDValue PerformSHLCombine(SDNode *N,
5475                                  TargetLowering::DAGCombinerInfo &DCI,
5476                                  CodeGenOptLevel OptLevel) {
5477   if (OptLevel > CodeGenOptLevel::None) {
5478     // Try mul.wide combining at OptLevel > 0
5479     if (SDValue Ret = TryMULWIDECombine(N, DCI))
5480       return Ret;
5481   }
5482 
5483   return SDValue();
5484 }
5485 
5486 static SDValue PerformSETCCCombine(SDNode *N,
5487                                    TargetLowering::DAGCombinerInfo &DCI,
5488                                    unsigned int SmVersion) {
5489   EVT CCType = N->getValueType(0);
5490   SDValue A = N->getOperand(0);
5491   SDValue B = N->getOperand(1);
5492 
5493   EVT AType = A.getValueType();
5494   if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
5495     return SDValue();
5496 
5497   if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
5498     return SDValue();
5499 
5500   SDLoc DL(N);
5501   // setp.f16x2 returns two scalar predicates, which we need to
5502   // convert back to v2i1. The returned result will be scalarized by
5503   // the legalizer, but the comparison will remain a single vector
5504   // instruction.
5505   SDValue CCNode = DCI.DAG.getNode(
5506       A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
5507                                      : NVPTXISD::SETP_BF16X2,
5508       DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
5509   return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
5510                          CCNode.getValue(1));
5511 }
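// Illustrative sketch: (setcc v2i1 %a, %b, setolt) with v2f16 operands becomes
// a single SETP_F16X2 node producing two i1 results that are packed back into
// v2i1, so the final PTX uses one setp.lt.f16x2 instead of two scalar setp
// instructions.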
5512 
5513 static SDValue PerformEXTRACTCombine(SDNode *N,
5514                                      TargetLowering::DAGCombinerInfo &DCI) {
5515   SDValue Vector = N->getOperand(0);
5516   SDLoc DL(N);
5517   EVT VectorVT = Vector.getValueType();
5518   if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
5519       IsPTXVectorType(VectorVT.getSimpleVT()))
5520     return SDValue(); // Native vector loads already combine nicely w/
5521                       // extract_vector_elt, except for v4i8.
5522   // Don't mess with singletons or v2*16 types; we already handle them OK.
5523   if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
5524       VectorVT == MVT::v4i8)
5525     return SDValue();
5526 
5527   uint64_t VectorBits = VectorVT.getSizeInBits();
5528   // We only handle the types we can extract in-register.
5529   if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
5530     return SDValue();
5531 
5532   ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
5533   // Index == 0 is handled by generic DAG combiner.
5534   if (!Index || Index->getZExtValue() == 0)
5535     return SDValue();
5536 
5537   MVT IVT = MVT::getIntegerVT(VectorBits);
5538   EVT EltVT = VectorVT.getVectorElementType();
5539   EVT EltIVT = EltVT.changeTypeToInteger();
5540   uint64_t EltBits = EltVT.getScalarSizeInBits();
5541 
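  // Extract the element by shifting the vector's bit pattern rather than
  // going through memory. E.g. (illustrative) element 1 of a v2f16 value V
  // becomes (bitcast f16 (trunc i16 (sra i32 (bitcast i32 V), 16))).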
5542   SDValue Result = DCI.DAG.getNode(
5543       ISD::TRUNCATE, DL, EltIVT,
5544       DCI.DAG.getNode(
5545           ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
5546           DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
5547 
5548   // If element has non-integer type, bitcast it back to the expected type.
5549   if (EltVT != EltIVT)
5550     Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
5551   // Past the legalizer, we may need to extend i8 -> i16 to match the register type.
5552   if (EltVT != N->getValueType(0))
5553     Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
5554 
5555   return Result;
5556 }
5557 
5558 static SDValue PerformVSELECTCombine(SDNode *N,
5559                                      TargetLowering::DAGCombinerInfo &DCI) {
5560   SDValue VA = N->getOperand(1);
5561   EVT VectorVT = VA.getValueType();
5562   if (VectorVT != MVT::v4i8)
5563     return SDValue();
5564 
5565   // We need to split the vselect into individual per-element operations.
5566   // Because we use BFE/BFI instructions for byte extraction/insertion, we
5567   // end up with 32-bit values anyway, so we may as well do the comparison
5568   // as i32 to avoid conversions to/from the i16 normally used for i8 values.
5569   SmallVector<SDValue, 4> E;
5570   SDLoc DL(N);
5571   SDValue VCond = N->getOperand(0);
5572   SDValue VB = N->getOperand(2);
5573   for (int I = 0; I < 4; ++I) {
5574     SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
5575                                 DCI.DAG.getConstant(I, DL, MVT::i32));
5576     SDValue EA = DCI.DAG.getAnyExtOrTrunc(
5577         DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
5578                         DCI.DAG.getConstant(I, DL, MVT::i32)),
5579         DL, MVT::i32);
5580     SDValue EB = DCI.DAG.getAnyExtOrTrunc(
5581         DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
5582                         DCI.DAG.getConstant(I, DL, MVT::i32)),
5583         DL, MVT::i32);
5584     E.push_back(DCI.DAG.getAnyExtOrTrunc(
5585         DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
5586   }
5587   return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
5588 }
5589 
5590 static SDValue PerformLOADCombine(SDNode *N,
5591                                   TargetLowering::DAGCombinerInfo &DCI) {
5592   SelectionDAG &DAG = DCI.DAG;
5593   LoadSDNode *LD = cast<LoadSDNode>(N);
5594 
5595   // Lower a v16i8 load into a LoadV4 operation with i32 results instead of
5596   // letting ReplaceLoadVector split it into smaller loads during legalization.
5597   // This is done at dag-combine1 time, so that vector operations with i8
5598   // elements can be optimised away instead of being needlessly split during
5599   // legalization, which involves storing to the stack and loading it back.
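  //
  // Illustrative sketch: a load of <16 x i8> becomes one NVPTXISD::LoadV4
  // producing four i32 results, which are rebuilt into a v4i32 BUILD_VECTOR
  // and bitcast back to v16i8, alongside the new chain value.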
5600   EVT VT = N->getValueType(0);
5601   if (VT != MVT::v16i8)
5602     return SDValue();
5603 
5604   SDLoc DL(N);
5605 
5606   // Create a v4i32 vector load operation, effectively <4 x v4i8>.
5607   unsigned Opc = NVPTXISD::LoadV4;
5608   EVT NewVT = MVT::v4i32;
5609   EVT EltVT = NewVT.getVectorElementType();
5610   unsigned NumElts = NewVT.getVectorNumElements();
5611   EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
5612   SDVTList RetVTList = DAG.getVTList(RetVTs);
5613   SmallVector<SDValue, 8> Ops(N->ops());
5614   Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
5615   SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT,
5616                                             LD->getMemOperand());
5617   SDValue NewChain = NewLoad.getValue(NumElts);
5618 
5619   // Create a vector of the same type returned by the original load.
5620   SmallVector<SDValue, 4> Elts;
5621   for (unsigned i = 0; i < NumElts; i++)
5622     Elts.push_back(NewLoad.getValue(i));
5623   return DCI.DAG.getMergeValues(
5624       {DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
5625        NewChain},
5626       DL);
5627 }
5628 
5629 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
5630                                                DAGCombinerInfo &DCI) const {
5631   CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
5632   switch (N->getOpcode()) {
5633     default: break;
5634     case ISD::ADD:
5635     case ISD::FADD:
5636       return PerformADDCombine(N, DCI, STI, OptLevel);
5637     case ISD::MUL:
5638       return PerformMULCombine(N, DCI, OptLevel);
5639     case ISD::SHL:
5640       return PerformSHLCombine(N, DCI, OptLevel);
5641     case ISD::AND:
5642       return PerformANDCombine(N, DCI);
5643     case ISD::UREM:
5644     case ISD::SREM:
5645       return PerformREMCombine(N, DCI, OptLevel);
5646     case ISD::SETCC:
5647       return PerformSETCCCombine(N, DCI, STI.getSmVersion());
5648     case ISD::LOAD:
5649       return PerformLOADCombine(N, DCI);
5650     case NVPTXISD::StoreRetval:
5651     case NVPTXISD::StoreRetvalV2:
5652     case NVPTXISD::StoreRetvalV4:
5653       return PerformStoreRetvalCombine(N);
5654     case ISD::EXTRACT_VECTOR_ELT:
5655       return PerformEXTRACTCombine(N, DCI);
5656     case ISD::VSELECT:
5657       return PerformVSELECTCombine(N, DCI);
5658   }
5659   return SDValue();
5660 }
5661 
5662 /// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
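/// E.g. (illustrative): a load of <4 x float> becomes one NVPTXISD::LoadV4
/// node with four f32 results plus a chain, which are reassembled into the
/// original vector type with a BUILD_VECTOR.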
5663 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
5664                               SmallVectorImpl<SDValue> &Results) {
5665   EVT ResVT = N->getValueType(0);
5666   SDLoc DL(N);
5667 
5668   assert(ResVT.isVector() && "Vector load must have vector type");
5669 
5670   // We only handle "native" vector sizes for now, e.g. <4 x double> is not
5671   // legal.  We can (and should) split that into 2 loads of <2 x double> here
5672   // but I'm leaving that as a TODO for now.
5673   assert(ResVT.isSimple() && "Can only handle simple types");
5674   switch (ResVT.getSimpleVT().SimpleTy) {
5675   default:
5676     return;
5677   case MVT::v2i8:
5678   case MVT::v2i16:
5679   case MVT::v2i32:
5680   case MVT::v2i64:
5681   case MVT::v2f16:
5682   case MVT::v2f32:
5683   case MVT::v2f64:
5684   case MVT::v4i8:
5685   case MVT::v4i16:
5686   case MVT::v4i32:
5687   case MVT::v4f16:
5688   case MVT::v4f32:
5689   case MVT::v8f16:  // <4 x f16x2>
5690   case MVT::v8bf16: // <4 x bf16x2>
5691   case MVT::v8i16:  // <4 x i16x2>
5692     // This is a "native" vector type
5693     break;
5694   }
5695 
5696   LoadSDNode *LD = cast<LoadSDNode>(N);
5697 
5698   Align Alignment = LD->getAlign();
5699   auto &TD = DAG.getDataLayout();
5700   Align PrefAlign =
5701       TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
5702   if (Alignment < PrefAlign) {
5703     // This load is not sufficiently aligned, so bail out and let this vector
5704     // load be scalarized.  Note that we may still be able to emit smaller
5705     // vector loads.  For example, if we are loading a <4 x float> with an
5706     // alignment of 8, this check will fail but the legalizer will try again
5707     // with 2 x <2 x float>, which will succeed with an alignment of 8.
5708     return;
5709   }
5710 
5711   EVT EltVT = ResVT.getVectorElementType();
5712   unsigned NumElts = ResVT.getVectorNumElements();
5713 
5714   // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
5715   // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
5716   // loaded type to i16 and propagate the "real" type as the memory type.
5717   bool NeedTrunc = false;
5718   if (EltVT.getSizeInBits() < 16) {
5719     EltVT = MVT::i16;
5720     NeedTrunc = true;
5721   }
5722 
5723   unsigned Opcode = 0;
5724   SDVTList LdResVTs;
5725   bool Load16x2 = false;
5726 
5727   switch (NumElts) {
5728   default:
5729     return;
5730   case 2:
5731     Opcode = NVPTXISD::LoadV2;
5732     LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
5733     break;
5734   case 4: {
5735     Opcode = NVPTXISD::LoadV4;
5736     EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
5737     LdResVTs = DAG.getVTList(ListVTs);
5738     break;
5739   }
5740   case 8: {
5741     // v8 vectors of 16-bit elements are a special case. PTX doesn't have
5742     // an ld.v8 instruction. Instead, we split the vector into 2-element
5743     // chunks (v2f16/v2bf16/v2i16) and load them with ld.v4.b32.
5744     assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
5745     Load16x2 = true;
5746     Opcode = NVPTXISD::LoadV4;
5747     EVT VVT;
5748     switch (EltVT.getSimpleVT().SimpleTy) {
5749     case MVT::f16:
5750       VVT = MVT::v2f16;
5751       break;
5752     case MVT::bf16:
5753       VVT = MVT::v2bf16;
5754       break;
5755     case MVT::i16:
5756       VVT = MVT::v2i16;
5757       break;
5758     default:
5759       llvm_unreachable("Unsupported v8 vector type.");
5760     }
5761     EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
5762     LdResVTs = DAG.getVTList(ListVTs);
5763     break;
5764   }
5765   }
5766 
5767   // Copy regular operands
5768   SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
5769 
5770   // The select routine does not have access to the LoadSDNode instance, so
5771   // pass along the extension information
5772   OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
5773 
5774   SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
5775                                           LD->getMemoryVT(),
5776                                           LD->getMemOperand());
5777 
5778   SmallVector<SDValue, 8> ScalarRes;
5779   if (Load16x2) {
5780     // Split the 2-element subvectors back into individual elements.
5781     NumElts /= 2;
5782     for (unsigned i = 0; i < NumElts; ++i) {
5783       SDValue SubVector = NewLD.getValue(i);
5784       SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
5785                                DAG.getIntPtrConstant(0, DL));
5786       SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
5787                                DAG.getIntPtrConstant(1, DL));
5788       ScalarRes.push_back(E0);
5789       ScalarRes.push_back(E1);
5790     }
5791   } else {
5792     for (unsigned i = 0; i < NumElts; ++i) {
5793       SDValue Res = NewLD.getValue(i);
5794       if (NeedTrunc)
5795         Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
5796       ScalarRes.push_back(Res);
5797     }
5798   }
5799 
5800   SDValue LoadChain = NewLD.getValue(NumElts);
5801 
5802   SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
5803 
5804   Results.push_back(BuildVec);
5805   Results.push_back(LoadChain);
5806 }
5807 
5808 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
5809                                      SmallVectorImpl<SDValue> &Results) {
5810   SDValue Chain = N->getOperand(0);
5811   SDValue Intrin = N->getOperand(1);
5812   SDLoc DL(N);
5813 
5814   // Get the intrinsic ID
5815   unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
5816   switch (IntrinNo) {
5817   default:
5818     return;
5819   case Intrinsic::nvvm_ldg_global_i:
5820   case Intrinsic::nvvm_ldg_global_f:
5821   case Intrinsic::nvvm_ldg_global_p:
5822   case Intrinsic::nvvm_ldu_global_i:
5823   case Intrinsic::nvvm_ldu_global_f:
5824   case Intrinsic::nvvm_ldu_global_p: {
5825     EVT ResVT = N->getValueType(0);
5826 
5827     if (ResVT.isVector()) {
5828       // Vector LDG/LDU
5829 
5830       unsigned NumElts = ResVT.getVectorNumElements();
5831       EVT EltVT = ResVT.getVectorElementType();
5832 
5833       // Since LDU/LDG are target nodes, we cannot rely on DAG type
5834       // legalization.
5835       // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
5836       // loaded type to i16 and propagate the "real" type as the memory type.
5837       bool NeedTrunc = false;
5838       if (EltVT.getSizeInBits() < 16) {
5839         EltVT = MVT::i16;
5840         NeedTrunc = true;
5841       }
5842 
5843       unsigned Opcode = 0;
5844       SDVTList LdResVTs;
5845 
5846       switch (NumElts) {
5847       default:
5848         return;
5849       case 2:
5850         switch (IntrinNo) {
5851         default:
5852           return;
5853         case Intrinsic::nvvm_ldg_global_i:
5854         case Intrinsic::nvvm_ldg_global_f:
5855         case Intrinsic::nvvm_ldg_global_p:
5856           Opcode = NVPTXISD::LDGV2;
5857           break;
5858         case Intrinsic::nvvm_ldu_global_i:
5859         case Intrinsic::nvvm_ldu_global_f:
5860         case Intrinsic::nvvm_ldu_global_p:
5861           Opcode = NVPTXISD::LDUV2;
5862           break;
5863         }
5864         LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
5865         break;
5866       case 4: {
5867         switch (IntrinNo) {
5868         default:
5869           return;
5870         case Intrinsic::nvvm_ldg_global_i:
5871         case Intrinsic::nvvm_ldg_global_f:
5872         case Intrinsic::nvvm_ldg_global_p:
5873           Opcode = NVPTXISD::LDGV4;
5874           break;
5875         case Intrinsic::nvvm_ldu_global_i:
5876         case Intrinsic::nvvm_ldu_global_f:
5877         case Intrinsic::nvvm_ldu_global_p:
5878           Opcode = NVPTXISD::LDUV4;
5879           break;
5880         }
5881         EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
5882         LdResVTs = DAG.getVTList(ListVTs);
5883         break;
5884       }
5885       }
5886 
5887       SmallVector<SDValue, 8> OtherOps;
5888 
5889       // Copy regular operands
5890 
5891       OtherOps.push_back(Chain); // Chain
5892                                  // Skip operand 1 (intrinsic ID)
5893       // Others
5894       OtherOps.append(N->op_begin() + 2, N->op_end());
5895 
5896       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
5897 
5898       SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
5899                                               MemSD->getMemoryVT(),
5900                                               MemSD->getMemOperand());
5901 
5902       SmallVector<SDValue, 4> ScalarRes;
5903 
5904       for (unsigned i = 0; i < NumElts; ++i) {
5905         SDValue Res = NewLD.getValue(i);
5906         if (NeedTrunc)
5907           Res =
5908               DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
5909         ScalarRes.push_back(Res);
5910       }
5911 
5912       SDValue LoadChain = NewLD.getValue(NumElts);
5913 
5914       SDValue BuildVec =
5915           DAG.getBuildVector(ResVT, DL, ScalarRes);
5916 
5917       Results.push_back(BuildVec);
5918       Results.push_back(LoadChain);
5919     } else {
5920       // i8 LDG/LDU
5921       assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
5922              "Custom handling of non-i8 ldu/ldg?");
5923 
5924       // Just copy all operands as-is
5925       SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
5926 
5927       // Force output to i16
5928       SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
5929 
5930       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
5931 
5932       // We make sure the memory type is i8, which will be used during isel
5933       // to select the proper instruction.
5934       SDValue NewLD =
5935           DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
5936                                   MVT::i8, MemSD->getMemOperand());
5937 
5938       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
5939                                     NewLD.getValue(0)));
5940       Results.push_back(NewLD.getValue(1));
5941     }
5942   }
5943   }
5944 }
5945 
5946 void NVPTXTargetLowering::ReplaceNodeResults(
5947     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
5948   switch (N->getOpcode()) {
5949   default:
5950     report_fatal_error("Unhandled custom legalization");
5951   case ISD::LOAD:
5952     ReplaceLoadVector(N, DAG, Results);
5953     return;
5954   case ISD::INTRINSIC_W_CHAIN:
5955     ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
5956     return;
5957   }
5958 }
5959 
5960 NVPTXTargetLowering::AtomicExpansionKind
5961 NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
5962   Type *Ty = AI->getValOperand()->getType();
5963 
5964   if (AI->isFloatingPointOperation()) {
5965     if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
5966       if (Ty->isFloatTy())
5967         return AtomicExpansionKind::None;
5968       if (Ty->isDoubleTy() && STI.hasAtomAddF64())
5969         return AtomicExpansionKind::None;
5970     }
5971     return AtomicExpansionKind::CmpXChg;
5972   }
5973 
5974   assert(Ty->isIntegerTy() && "Ty should be integer at this point");
5975   auto ITy = cast<llvm::IntegerType>(Ty);
5976 
5977   switch (AI->getOperation()) {
5978   default:
5979     return AtomicExpansionKind::CmpXChg;
5980   case AtomicRMWInst::BinOp::And:
5981   case AtomicRMWInst::BinOp::Or:
5982   case AtomicRMWInst::BinOp::Xor:
5983   case AtomicRMWInst::BinOp::Xchg:
5984     switch (ITy->getBitWidth()) {
5985     case 8:
5986     case 16:
5987       return AtomicExpansionKind::CmpXChg;
5988     case 32:
5989       return AtomicExpansionKind::None;
5990     case 64:
5991       if (STI.hasAtomBitwise64())
5992         return AtomicExpansionKind::None;
5993       return AtomicExpansionKind::CmpXChg;
5994     default:
5995       llvm_unreachable("unsupported width encountered");
5996     }
5997   case AtomicRMWInst::BinOp::Add:
5998   case AtomicRMWInst::BinOp::Sub:
5999   case AtomicRMWInst::BinOp::Max:
6000   case AtomicRMWInst::BinOp::Min:
6001   case AtomicRMWInst::BinOp::UMax:
6002   case AtomicRMWInst::BinOp::UMin:
6003     switch (ITy->getBitWidth()) {
6004     case 8:
6005     case 16:
6006       return AtomicExpansionKind::CmpXChg;
6007     case 32:
6008       return AtomicExpansionKind::None;
6009     case 64:
6010       if (STI.hasAtomMinMax64())
6011         return AtomicExpansionKind::None;
6012       return AtomicExpansionKind::CmpXChg;
6013     default:
6014       llvm_unreachable("unsupported width encountered");
6015     }
6016   }
6017 
6018   return AtomicExpansionKind::CmpXChg;
6019 }
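// Illustrative summary of the rules above (for exposition only): an
// "atomicrmw add i32" stays native (e.g. atom.add.u32), an "atomicrmw xor
// i16" is expanded into a cmpxchg loop, 64-bit bitwise/min-max ops are native
// only when the subtarget reports support, and "atomicrmw fadd double" is
// native only with hasAtomAddF64(); everything else falls back to
// AtomicExpansionKind::CmpXChg.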
6020 
6021 // Pin NVPTXTargetObjectFile's vtables to this file.
6022 NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;
6023 
6024 MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
6025     const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
6026   return getDataSection();
6027 }
6028