xref: /freebsd/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (revision 46c59ea9b61755455ff6bf9f3e7b834e1af634ea)
1 //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the interfaces that NVPTX uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "NVPTXISelLowering.h"
15 #include "MCTargetDesc/NVPTXBaseInfo.h"
16 #include "NVPTX.h"
17 #include "NVPTXSubtarget.h"
18 #include "NVPTXTargetMachine.h"
19 #include "NVPTXTargetObjectFile.h"
20 #include "NVPTXUtilities.h"
21 #include "llvm/ADT/APInt.h"
22 #include "llvm/ADT/STLExtras.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/ADT/StringRef.h"
25 #include "llvm/CodeGen/Analysis.h"
26 #include "llvm/CodeGen/ISDOpcodes.h"
27 #include "llvm/CodeGen/MachineFunction.h"
28 #include "llvm/CodeGen/MachineMemOperand.h"
29 #include "llvm/CodeGen/MachineValueType.h"
30 #include "llvm/CodeGen/SelectionDAG.h"
31 #include "llvm/CodeGen/SelectionDAGNodes.h"
32 #include "llvm/CodeGen/TargetCallingConv.h"
33 #include "llvm/CodeGen/TargetLowering.h"
34 #include "llvm/CodeGen/ValueTypes.h"
35 #include "llvm/IR/Argument.h"
36 #include "llvm/IR/Attributes.h"
37 #include "llvm/IR/Constants.h"
38 #include "llvm/IR/DataLayout.h"
39 #include "llvm/IR/DerivedTypes.h"
40 #include "llvm/IR/DiagnosticInfo.h"
41 #include "llvm/IR/FPEnv.h"
42 #include "llvm/IR/Function.h"
43 #include "llvm/IR/GlobalValue.h"
44 #include "llvm/IR/Instruction.h"
45 #include "llvm/IR/Instructions.h"
46 #include "llvm/IR/IntrinsicsNVPTX.h"
47 #include "llvm/IR/Module.h"
48 #include "llvm/IR/Type.h"
49 #include "llvm/IR/Value.h"
50 #include "llvm/Support/Casting.h"
51 #include "llvm/Support/CodeGen.h"
52 #include "llvm/Support/CommandLine.h"
53 #include "llvm/Support/ErrorHandling.h"
54 #include "llvm/Support/raw_ostream.h"
55 #include "llvm/Target/TargetMachine.h"
56 #include "llvm/Target/TargetOptions.h"
57 #include <algorithm>
58 #include <cassert>
59 #include <cmath>
60 #include <cstdint>
61 #include <iterator>
62 #include <sstream>
63 #include <string>
64 #include <utility>
65 #include <vector>
66 
67 #define DEBUG_TYPE "nvptx-lower"
68 
69 using namespace llvm;
70 
71 static std::atomic<unsigned> GlobalUniqueCallSite;
72 
73 static cl::opt<bool> sched4reg(
74     "nvptx-sched4reg",
75     cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));
76 
77 static cl::opt<unsigned> FMAContractLevelOpt(
78     "nvptx-fma-level", cl::Hidden,
79     cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
80              " 1: do it, 2: do it aggressively)"),
81     cl::init(2));
82 
83 static cl::opt<int> UsePrecDivF32(
84     "nvptx-prec-divf32", cl::Hidden,
85     cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
86              " IEEE Compliant F32 div.rnd if available."),
87     cl::init(2));
88 
89 static cl::opt<bool> UsePrecSqrtF32(
90     "nvptx-prec-sqrtf32", cl::Hidden,
91     cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
92     cl::init(true));
93 
94 static cl::opt<bool> ForceMinByValParamAlign(
95     "nvptx-force-min-byval-param-align", cl::Hidden,
96     cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
97              " params of device functions."),
98     cl::init(false));
99 
100 int NVPTXTargetLowering::getDivF32Level() const {
101   if (UsePrecDivF32.getNumOccurrences() > 0) {
102   // If nvptx-prec-divf32=N is used on the command-line, always honor it
103     return UsePrecDivF32;
104   } else {
105     // Otherwise, use div.approx if fast math is enabled
106     if (getTargetMachine().Options.UnsafeFPMath)
107       return 0;
108     else
109       return 2;
110   }
111 }
112 
113 bool NVPTXTargetLowering::usePrecSqrtF32() const {
114   if (UsePrecSqrtF32.getNumOccurrences() > 0) {
115     // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
116     return UsePrecSqrtF32;
117   } else {
118     // Otherwise, use sqrt.approx if fast math is enabled
119     return !getTargetMachine().Options.UnsafeFPMath;
120   }
121 }
122 
123 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
124   return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
125          DenormalMode::PreserveSign;
126 }
127 
128 static bool IsPTXVectorType(MVT VT) {
129   switch (VT.SimpleTy) {
130   default:
131     return false;
132   case MVT::v2i1:
133   case MVT::v4i1:
134   case MVT::v2i8:
135   case MVT::v4i8:
136   case MVT::v2i16:
137   case MVT::v4i16:
138   case MVT::v8i16: // <4 x i16x2>
139   case MVT::v2i32:
140   case MVT::v4i32:
141   case MVT::v2i64:
142   case MVT::v2f16:
143   case MVT::v4f16:
144   case MVT::v8f16: // <4 x f16x2>
145   case MVT::v2bf16:
146   case MVT::v4bf16:
147   case MVT::v8bf16: // <4 x bf16x2>
148   case MVT::v2f32:
149   case MVT::v4f32:
150   case MVT::v2f64:
151     return true;
152   }
153 }
154 
155 static bool Is16bitsType(MVT VT) {
156   return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 ||
157           VT.SimpleTy == MVT::i16);
158 }
159 
160 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
161 /// EVTs that compose it.  Unlike ComputeValueVTs, this will break apart vectors
162 /// into their primitive components.
163 /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
164 /// same number of types as the Ins/Outs arrays in LowerFormalArguments,
165 /// LowerCall, and LowerReturn.
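///
/// For example (illustrative, derived from the rules below): an i128 argument
/// is flattened to { i64 @ 0, i64 @ 8 }, a <4 x half> argument to
/// { v2f16 @ 0, v2f16 @ 4 }, and a <3 x i8> argument to a single v4i8 piece.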
166 static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
167                                Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
168                                SmallVectorImpl<uint64_t> *Offsets = nullptr,
169                                uint64_t StartingOffset = 0) {
170   SmallVector<EVT, 16> TempVTs;
171   SmallVector<uint64_t, 16> TempOffsets;
172 
173   // Special case for i128 - decompose to (i64, i64)
174   if (Ty->isIntegerTy(128)) {
175     ValueVTs.push_back(EVT(MVT::i64));
176     ValueVTs.push_back(EVT(MVT::i64));
177 
178     if (Offsets) {
179       Offsets->push_back(StartingOffset + 0);
180       Offsets->push_back(StartingOffset + 8);
181     }
182 
183     return;
184   }
185 
186   // Given a struct type, recursively flatten each element at its struct offset.
187   if (StructType *STy = dyn_cast<StructType>(Ty)) {
188     auto const *SL = DL.getStructLayout(STy);
189     auto ElementNum = 0;
190     for(auto *EI : STy->elements()) {
191       ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
192                          StartingOffset + SL->getElementOffset(ElementNum));
193       ++ElementNum;
194     }
195     return;
196   }
197 
198   ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
199   for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
200     EVT VT = TempVTs[i];
201     uint64_t Off = TempOffsets[i];
202     // Split vectors into individual elements, except for the packed types
203     // (v2f16/v2bf16/v2i16/v4i8), which we pass as a single scalar.
204     if (VT.isVector()) {
205       unsigned NumElts = VT.getVectorNumElements();
206       EVT EltVT = VT.getVectorElementType();
207       // Vectors with an even number of 16-bit elements (f16/bf16/i16) will
208       // be passed to us as an array of v2f16/v2bf16/v2i16 elements. We must
209       // match this so we stay in sync with Ins/Outs.
210       if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) {
211         switch (EltVT.getSimpleVT().SimpleTy) {
212         case MVT::f16:
213           EltVT = MVT::v2f16;
214           break;
215         case MVT::bf16:
216           EltVT = MVT::v2bf16;
217           break;
218         case MVT::i16:
219           EltVT = MVT::v2i16;
220           break;
221         default:
222           llvm_unreachable("Unexpected type");
223         }
224         NumElts /= 2;
225       } else if (EltVT.getSimpleVT() == MVT::i8 &&
226                  (NumElts % 4 == 0 || NumElts == 3)) {
227         // v*i8 are formally lowered as v4i8
228         EltVT = MVT::v4i8;
229         NumElts = (NumElts + 3) / 4;
230       }
231       for (unsigned j = 0; j != NumElts; ++j) {
232         ValueVTs.push_back(EltVT);
233         if (Offsets)
234           Offsets->push_back(Off + j * EltVT.getStoreSize());
235       }
236     } else {
237       ValueVTs.push_back(VT);
238       if (Offsets)
239         Offsets->push_back(Off);
240     }
241   }
242 }
243 
244 /// PromoteScalarIntegerPTX
245 /// Used to make sure the arguments/return values are suitable for passing,
246 /// promoting them to a larger size if they are not.
247 ///
248 /// The promoted type is placed in \p PromotedVT if the function returns true.
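///
/// For example (illustrative): i7 is promoted to i8, i30 to i32, and i33 to
/// i64; an integer that already has a supported power-of-two width, e.g. i32,
/// is left unchanged and the function returns false.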
249 static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
250   if (VT.isScalarInteger()) {
251     switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
252     default:
253       llvm_unreachable(
254           "Promotion is not suitable for scalars of size larger than 64-bits");
255     case 1:
256       *PromotedVT = MVT::i1;
257       break;
258     case 2:
259     case 4:
260     case 8:
261       *PromotedVT = MVT::i8;
262       break;
263     case 16:
264       *PromotedVT = MVT::i16;
265       break;
266     case 32:
267       *PromotedVT = MVT::i32;
268       break;
269     case 64:
270       *PromotedVT = MVT::i64;
271       break;
272     }
273     return EVT(*PromotedVT) != VT;
274   }
275   return false;
276 }
277 
278 // Check whether we can merge loads/stores of some of the pieces of a
279 // flattened function parameter or return value into a single vector
280 // load/store.
281 //
282 // The flattened parameter is represented as a list of EVTs and
283 // offsets, and the whole structure is aligned to ParamAlignment. This
284 // function determines whether we can load/store pieces of the
285 // parameter starting at index Idx using a single vectorized op of
286 // size AccessSize. If so, it returns the number of param pieces
287 // covered by the vector op. Otherwise, it returns 1.
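//
// For example (illustrative): four f32 pieces at offsets 0/4/8/12 of a
// 16-byte-aligned parameter can be covered by a single 16-byte access, so a
// query with AccessSize == 16 returns 4. With only 8-byte alignment the same
// query returns 1, because the parameter alignment is insufficient.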
288 static unsigned CanMergeParamLoadStoresStartingAt(
289     unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
290     const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
291 
292   // Can't vectorize if param alignment is not sufficient.
293   if (ParamAlignment < AccessSize)
294     return 1;
295   // Can't vectorize if offset is not aligned.
296   if (Offsets[Idx] & (AccessSize - 1))
297     return 1;
298 
299   EVT EltVT = ValueVTs[Idx];
300   unsigned EltSize = EltVT.getStoreSize();
301 
302   // Element is too large to vectorize.
303   if (EltSize >= AccessSize)
304     return 1;
305 
306   unsigned NumElts = AccessSize / EltSize;
307   // Can't vectorize if AccessSize is not a multiple of EltSize.
308   if (AccessSize != EltSize * NumElts)
309     return 1;
310 
311   // We don't have enough elements to vectorize.
312   if (Idx + NumElts > ValueVTs.size())
313     return 1;
314 
315   // PTX ISA can only deal with 2- and 4-element vector ops.
316   if (NumElts != 4 && NumElts != 2)
317     return 1;
318 
319   for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
320     // Types do not match.
321     if (ValueVTs[j] != EltVT)
322       return 1;
323 
324     // Elements are not contiguous.
325     if (Offsets[j] - Offsets[j - 1] != EltSize)
326       return 1;
327   }
328   // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
329   return NumElts;
330 }
331 
332 // Flags for tracking per-element vectorization state of loads/stores
333 // of a flattened function parameter or return value.
334 enum ParamVectorizationFlags {
335   PVF_INNER = 0x0, // Middle elements of a vector.
336   PVF_FIRST = 0x1, // First element of the vector.
337   PVF_LAST = 0x2,  // Last element of the vector.
338   // Scalar is effectively a 1-element vector.
339   PVF_SCALAR = PVF_FIRST | PVF_LAST
340 };
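
// For example (illustrative): a parameter flattened into four pieces that are
// merged into one vector op is tagged {PVF_FIRST, PVF_INNER, PVF_INNER,
// PVF_LAST}, while a piece accessed on its own is PVF_SCALAR
// (== PVF_FIRST | PVF_LAST).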
341 
342 // Computes whether and how we can vectorize the loads/stores of a
343 // flattened function parameter or return value.
344 //
345 // The flattened parameter is represented as the list of ValueVTs and
346 // Offsets, and is aligned to ParamAlignment bytes. We return a vector
347 // of the same size as ValueVTs indicating how each piece should be
348 // loaded/stored (i.e. as a scalar, or as part of a vector
349 // load/store).
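//
// For example (illustrative): {f32, f32, f32, f32} at offsets 0/4/8/12 with a
// 16-byte ParamAlignment comes back as {PVF_FIRST, PVF_INNER, PVF_INNER,
// PVF_LAST} (a single 16-byte access); with only a 4-byte ParamAlignment every
// piece stays PVF_SCALAR.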
350 static SmallVector<ParamVectorizationFlags, 16>
351 VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
352                      const SmallVectorImpl<uint64_t> &Offsets,
353                      Align ParamAlignment, bool IsVAArg = false) {
354   // Set vector size to match ValueVTs and mark all elements as
355   // scalars by default.
356   SmallVector<ParamVectorizationFlags, 16> VectorInfo;
357   VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
358 
359   if (IsVAArg)
360     return VectorInfo;
361 
362   // Check what we can vectorize using 128/64/32/16-bit accesses.
363   for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
364     // Skip elements we've already processed.
365     assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
366     for (unsigned AccessSize : {16, 8, 4, 2}) {
367       unsigned NumElts = CanMergeParamLoadStoresStartingAt(
368           I, AccessSize, ValueVTs, Offsets, ParamAlignment);
369       // Mark vectorized elements.
370       switch (NumElts) {
371       default:
372         llvm_unreachable("Unexpected return value");
373       case 1:
374         // Can't vectorize using this size, try next smaller size.
375         continue;
376       case 2:
377         assert(I + 1 < E && "Not enough elements.");
378         VectorInfo[I] = PVF_FIRST;
379         VectorInfo[I + 1] = PVF_LAST;
380         I += 1;
381         break;
382       case 4:
383         assert(I + 3 < E && "Not enough elements.");
384         VectorInfo[I] = PVF_FIRST;
385         VectorInfo[I + 1] = PVF_INNER;
386         VectorInfo[I + 2] = PVF_INNER;
387         VectorInfo[I + 3] = PVF_LAST;
388         I += 3;
389         break;
390       }
391       // Break out of the inner loop because we've already succeeded
392       // using largest possible AccessSize.
393       break;
394     }
395   }
396   return VectorInfo;
397 }
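
// Illustrative sketch (not a call site) of how the helpers above fit together;
// the real uses are in LowerCall/LowerFormalArguments/LowerReturn, with
// DL/Ty/ArgAlign standing in for the values available there:
//   SmallVector<EVT, 16> VTs;
//   SmallVector<uint64_t, 16> Offsets;
//   ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
//   auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
//   // Pieces tagged PVF_FIRST open a vector access, PVF_LAST closes it, and
//   // PVF_SCALAR pieces are loaded/stored individually.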
398 
399 // NVPTXTargetLowering Constructor.
400 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
401                                          const NVPTXSubtarget &STI)
402     : TargetLowering(TM), nvTM(&TM), STI(STI) {
403   // Always lower memset, memcpy, and memmove intrinsics to load/store
404   // instructions, rather than generating calls to memset, memcpy, or
405   // memmove.
406   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF;
407   MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned) 0xFFFFFFFF;
408   MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned) 0xFFFFFFFF;
409 
410   setBooleanContents(ZeroOrNegativeOneBooleanContent);
411   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
412 
413   // Jump is Expensive. Don't create extra control flow for 'and', 'or'
414   // condition branches.
415   setJumpIsExpensive(true);
416 
417   // Wide divides are _very_ slow. Try to reduce the width of the divide if
418   // possible.
419   addBypassSlowDiv(64, 32);
420 
421   // By default, use Source scheduling.
422   if (sched4reg)
423     setSchedulingPreference(Sched::RegPressure);
424   else
425     setSchedulingPreference(Sched::Source);
426 
427   auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
428                                     LegalizeAction NoF16Action) {
429     setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
430   };
431 
432   auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
433                                     LegalizeAction NoBF16Action) {
434     bool IsOpSupported = STI.hasBF16Math();
435     // A few of these instructions are only available on sm_90.
436     switch (Op) {
437       case ISD::FADD:
438       case ISD::FMUL:
439       case ISD::FSUB:
440       case ISD::SELECT:
441       case ISD::SELECT_CC:
442       case ISD::SETCC:
443       case ISD::FEXP2:
444       case ISD::FCEIL:
445       case ISD::FFLOOR:
446       case ISD::FNEARBYINT:
447       case ISD::FRINT:
448       case ISD::FTRUNC:
449         IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;
450         break;
451     }
452     setOperationAction(
453         Op, VT, IsOpSupported ? Action : NoBF16Action);
454   };
455 
456   auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
457                                      LegalizeAction NoI16x2Action) {
458     bool IsOpSupported = false;
459     // These instructions are only available on sm_90.
460     switch (Op) {
461     case ISD::ADD:
462     case ISD::SMAX:
463     case ISD::SMIN:
464     case ISD::UMIN:
465     case ISD::UMAX:
466     case ISD::SUB:
467       IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
468       break;
469     }
470     setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
471   };
472 
473   addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
474   addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
475   addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
476   addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
477   addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
478   addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
479   addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
480   addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
481   addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
482   addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
483   addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
484   addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);
485 
486   // Conversion to/from FP16/FP16x2 is always legal.
487   setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
488   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
489   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
490   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);
491 
492   setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
493   setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
494 
495   // Conversion to/from BF16/BF16x2 is always legal.
496   setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Custom);
497   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2bf16, Custom);
498   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2bf16, Expand);
499   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2bf16, Expand);
500 
501   setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
502   setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
503   if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
504     AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
505 
506   // Conversion to/from i16/i16x2 is always legal.
507   setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
508   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
509   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand);
510   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand);
511 
512   setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);
513   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
514   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
515   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
516   // Only logical ops can be done on v4i8 directly; others must be done
517   // elementwise.
518   setOperationAction(
519       {ISD::ABS,         ISD::ADD,        ISD::ADDC,        ISD::ADDE,
520        ISD::BITREVERSE,  ISD::CTLZ,       ISD::CTPOP,       ISD::CTTZ,
521        ISD::FP_TO_SINT,  ISD::FP_TO_UINT, ISD::FSHL,        ISD::FSHR,
522        ISD::MUL,         ISD::MULHS,      ISD::MULHU,       ISD::PARITY,
523        ISD::ROTL,        ISD::ROTR,       ISD::SADDO,       ISD::SADDO_CARRY,
524        ISD::SADDSAT,     ISD::SDIV,       ISD::SDIVREM,     ISD::SELECT_CC,
525        ISD::SETCC,       ISD::SHL,        ISD::SINT_TO_FP,  ISD::SMAX,
526        ISD::SMIN,        ISD::SMULO,      ISD::SMUL_LOHI,   ISD::SRA,
527        ISD::SREM,        ISD::SRL,        ISD::SSHLSAT,     ISD::SSUBO,
528        ISD::SSUBO_CARRY, ISD::SSUBSAT,    ISD::SUB,         ISD::SUBC,
529        ISD::SUBE,        ISD::UADDO,      ISD::UADDO_CARRY, ISD::UADDSAT,
530        ISD::UDIV,        ISD::UDIVREM,    ISD::UINT_TO_FP,  ISD::UMAX,
531        ISD::UMIN,        ISD::UMULO,      ISD::UMUL_LOHI,   ISD::UREM,
532        ISD::USHLSAT,     ISD::USUBO,      ISD::USUBO_CARRY, ISD::VSELECT,
533        ISD::USUBSAT},
534       MVT::v4i8, Expand);
535 
536   // Operations not directly supported by NVPTX.
537   for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
538                  MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
539                  MVT::i32, MVT::i64}) {
540     setOperationAction(ISD::SELECT_CC, VT, Expand);
541     setOperationAction(ISD::BR_CC, VT, Expand);
542   }
543 
544   // Some SIGN_EXTEND_INREG can be done using the cvt instruction.
545   // For others we will expand to a SHL/SRA pair.
546   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
547   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
548   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
549   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
550   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
551   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
552 
553   setOperationAction(ISD::SHL_PARTS, MVT::i32  , Custom);
554   setOperationAction(ISD::SRA_PARTS, MVT::i32  , Custom);
555   setOperationAction(ISD::SRL_PARTS, MVT::i32  , Custom);
556   setOperationAction(ISD::SHL_PARTS, MVT::i64  , Custom);
557   setOperationAction(ISD::SRA_PARTS, MVT::i64  , Custom);
558   setOperationAction(ISD::SRL_PARTS, MVT::i64  , Custom);
559 
560   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
561   setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
562 
563   // TODO: we may consider expanding ROTL/ROTR on older GPUs.  Currently on GPUs
564   // that don't have h/w rotation we lower them to multi-instruction assembly.
565   // See ROT*_sw in NVPTXInstrInfo.td.
566   setOperationAction(ISD::ROTL, MVT::i64, Legal);
567   setOperationAction(ISD::ROTR, MVT::i64, Legal);
568   setOperationAction(ISD::ROTL, MVT::i32, Legal);
569   setOperationAction(ISD::ROTR, MVT::i32, Legal);
570 
571   setOperationAction(ISD::ROTL, MVT::i16, Expand);
572   setOperationAction(ISD::ROTL, MVT::v2i16, Expand);
573   setOperationAction(ISD::ROTR, MVT::i16, Expand);
574   setOperationAction(ISD::ROTR, MVT::v2i16, Expand);
575   setOperationAction(ISD::ROTL, MVT::i8, Expand);
576   setOperationAction(ISD::ROTR, MVT::i8, Expand);
577   setOperationAction(ISD::BSWAP, MVT::i16, Expand);
578   setOperationAction(ISD::BSWAP, MVT::v2i16, Expand);
579   setOperationAction(ISD::BSWAP, MVT::i32, Expand);
580   setOperationAction(ISD::BSWAP, MVT::i64, Expand);
581 
582   // Indirect branch is not supported.
583   // This also disables Jump Table creation.
584   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
585   setOperationAction(ISD::BRIND, MVT::Other, Expand);
586 
587   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
588   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
589 
590   // We want to legalize constant-related memmove and memcpy
591   // intrinsics.
592   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
593 
594   // Turn FP extload into load/fpextend
595   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
596   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
597   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
598   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
599   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
600   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
601   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
602   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
603   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
604   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
605   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
606   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
607   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
608   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
609   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
610   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
611   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
612   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
613   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
614   // Turn FP truncstore into trunc + store.
615   // FIXME: vector types should also be expanded
616   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
617   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
618   setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
619   setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
620   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
621 
622   // PTX does not support load/store of predicate registers.
623   setOperationAction(ISD::LOAD, MVT::i1, Custom);
624   setOperationAction(ISD::STORE, MVT::i1, Custom);
625 
626   for (MVT VT : MVT::integer_valuetypes()) {
627     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
628     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
629     setTruncStoreAction(VT, MVT::i1, Expand);
630   }
631 
632   // Expand extload of integer vectors.
633   setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
634                    MVT::v2i8, Expand);
635   setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
636 
637   // This is legal in NVPTX
638   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
639   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
640   setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
641   setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
642 
643   // Lowering of DYNAMIC_STACKALLOC is unsupported.
644   // Custom lower to produce an error.
645   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
646   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
647 
648   // TRAP can be lowered to PTX trap
649   setOperationAction(ISD::TRAP, MVT::Other, Legal);
650 
651   // Register custom handling for vector loads/stores
652   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
653     if (IsPTXVectorType(VT)) {
654       setOperationAction(ISD::LOAD, VT, Custom);
655       setOperationAction(ISD::STORE, VT, Custom);
656       setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
657     }
658   }
659 
660   // Support varargs.
661   setOperationAction(ISD::VASTART, MVT::Other, Custom);
662   setOperationAction(ISD::VAARG, MVT::Other, Custom);
663   setOperationAction(ISD::VACOPY, MVT::Other, Expand);
664   setOperationAction(ISD::VAEND, MVT::Other, Expand);
665 
666   // Custom handling for i8 intrinsics
667   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
668 
669   for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
670     setOperationAction(ISD::ABS,  Ty, Legal);
671     setOperationAction(ISD::SMIN, Ty, Legal);
672     setOperationAction(ISD::SMAX, Ty, Legal);
673     setOperationAction(ISD::UMIN, Ty, Legal);
674     setOperationAction(ISD::UMAX, Ty, Legal);
675 
676     setOperationAction(ISD::CTPOP, Ty, Legal);
677     setOperationAction(ISD::CTLZ, Ty, Legal);
678   }
679 
680   setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
681   setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
682   setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
683   setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
684   setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
685   setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
686   setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
687 
688   setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
689   setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
690   setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
691   setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
692   setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
693   setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
694 
695   // Other arithmetic and logic ops are unsupported.
696   setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,
697                       ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
698                       ISD::SINT_TO_FP, ISD::UINT_TO_FP},
699                      MVT::v2i16, Expand);
700 
701   setOperationAction(ISD::ADDC, MVT::i32, Legal);
702   setOperationAction(ISD::ADDE, MVT::i32, Legal);
703   setOperationAction(ISD::SUBC, MVT::i32, Legal);
704   setOperationAction(ISD::SUBE, MVT::i32, Legal);
705   if (STI.getPTXVersion() >= 43) {
706     setOperationAction(ISD::ADDC, MVT::i64, Legal);
707     setOperationAction(ISD::ADDE, MVT::i64, Legal);
708     setOperationAction(ISD::SUBC, MVT::i64, Legal);
709     setOperationAction(ISD::SUBE, MVT::i64, Legal);
710   }
711 
712   setOperationAction(ISD::CTTZ, MVT::i16, Expand);
713   setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
714   setOperationAction(ISD::CTTZ, MVT::i32, Expand);
715   setOperationAction(ISD::CTTZ, MVT::i64, Expand);
716 
717   // PTX does not directly support SELP of i1, so promote to i32 first
718   setOperationAction(ISD::SELECT, MVT::i1, Custom);
719 
720   // PTX cannot multiply two i64s in a single instruction.
721   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
722   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
723 
724   // We have some custom DAG combine patterns for these nodes
725   setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
726                        ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
727                        ISD::VSELECT});
728 
729   // setcc for f16x2 and bf16x2 needs special handling to prevent
730   // legalizer's attempt to scalarize it due to v2i1 not being legal.
731   if (STI.allowFP16Math() || STI.hasBF16Math())
732     setTargetDAGCombine(ISD::SETCC);
733 
734   // Promote fp16 arithmetic if fp16 hardware isn't available or the
735   // user passed --nvptx-no-fp16-math. The flag is useful because,
736   // although sm_53+ GPUs have some sort of FP16 support in
737   // hardware, only sm_53 and sm_60 have full implementation. Others
738   // hardware, only sm_53 and sm_60 have a full implementation. Others
739   // only have a token amount of hardware and are likely to run faster
740   for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
741     setFP16OperationAction(Op, MVT::f16, Legal, Promote);
742     setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
743     setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
744     // bf16 must be promoted to f32.
745     setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
746     if (getOperationAction(Op, MVT::bf16) == Promote)
747       AddPromotedToType(Op, MVT::bf16, MVT::f32);
748   }
749 
750   // f16/f16x2 neg was introduced in PTX 60, SM_53.
751   const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
752                                         STI.getPTXVersion() >= 60 &&
753                                         STI.allowFP16Math();
754   for (const auto &VT : {MVT::f16, MVT::v2f16})
755     setOperationAction(ISD::FNEG, VT,
756                        IsFP16FP16x2NegAvailable ? Legal : Expand);
757 
758   setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
759   setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
760   // (would be) Library functions.
761 
762   // These map to conversion instructions for scalar FP types.
763   for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
764                          ISD::FROUNDEVEN, ISD::FTRUNC}) {
765     setOperationAction(Op, MVT::f16, Legal);
766     setOperationAction(Op, MVT::f32, Legal);
767     setOperationAction(Op, MVT::f64, Legal);
768     setOperationAction(Op, MVT::v2f16, Expand);
769     setOperationAction(Op, MVT::v2bf16, Expand);
770     setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
771     if (getOperationAction(Op, MVT::bf16) == Promote)
772       AddPromotedToType(Op, MVT::bf16, MVT::f32);
773   }
774 
775   // sm_80 only has conversions between f32 and bf16. Custom lower all other
776   // bf16 conversions.
777   if (STI.hasBF16Math() &&
778       (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
779     for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
780       setOperationAction(
781           {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
782           VT, Custom);
783     }
784   }
785 
786   setOperationAction(ISD::FROUND, MVT::f16, Promote);
787   setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
788   setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);
789   setOperationAction(ISD::FROUND, MVT::f32, Custom);
790   setOperationAction(ISD::FROUND, MVT::f64, Custom);
791   setOperationAction(ISD::FROUND, MVT::bf16, Promote);
792   AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
793 
794   // 'Expand' implements FCOPYSIGN without calling an external library.
795   setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
796   setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
797   setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);
798   setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand);
799   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
800   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
801 
802   // These map to corresponding instructions for f32/f64. f16 must be
803   // promoted to f32. v2f16 is expanded to f16, which is then promoted
804   // to f32.
805   for (const auto &Op :
806        {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {
807     setOperationAction(Op, MVT::f16, Promote);
808     setOperationAction(Op, MVT::f32, Legal);
809     setOperationAction(Op, MVT::f64, Legal);
810     setOperationAction(Op, MVT::v2f16, Expand);
811     setOperationAction(Op, MVT::v2bf16, Expand);
812     setOperationAction(Op, MVT::bf16, Promote);
813     AddPromotedToType(Op, MVT::bf16, MVT::f32);
814   }
815   for (const auto &Op : {ISD::FABS}) {
816     setOperationAction(Op, MVT::f16, Promote);
817     setOperationAction(Op, MVT::f32, Legal);
818     setOperationAction(Op, MVT::f64, Legal);
819     setOperationAction(Op, MVT::v2f16, Expand);
820     setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
821     setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
822     if (getOperationAction(Op, MVT::bf16) == Promote)
823       AddPromotedToType(Op, MVT::bf16, MVT::f32);
824   }
825 
826   // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
827   auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
828     bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
829     return IsAtLeastSm80 ? Legal : NotSm80Action;
830   };
831   for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
832     setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
833     setOperationAction(Op, MVT::f32, Legal);
834     setOperationAction(Op, MVT::f64, Legal);
835     setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
836     setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
837     setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
838     if (getOperationAction(Op, MVT::bf16) == Promote)
839       AddPromotedToType(Op, MVT::bf16, MVT::f32);
840   }
841   for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
842     setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
843     setFP16OperationAction(Op, MVT::bf16, Legal, Expand);
844     setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
845     setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
846     setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
847   }
848 
849   // No FEXP2, FLOG2.  The PTX ex2 and log2 functions are always approximate.
850   // No FPOW or FREM in PTX.
851 
852   // Now deduce the register properties based on the above-mentioned
853   // actions.
854   computeRegisterProperties(STI.getRegisterInfo());
855 
856   setMinCmpXchgSizeInBits(32);
857   setMaxAtomicSizeInBitsSupported(64);
858 }
859 
860 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
861   switch ((NVPTXISD::NodeType)Opcode) {
862   case NVPTXISD::FIRST_NUMBER:
863     break;
864   case NVPTXISD::CALL:
865     return "NVPTXISD::CALL";
866   case NVPTXISD::RET_GLUE:
867     return "NVPTXISD::RET_GLUE";
868   case NVPTXISD::LOAD_PARAM:
869     return "NVPTXISD::LOAD_PARAM";
870   case NVPTXISD::Wrapper:
871     return "NVPTXISD::Wrapper";
872   case NVPTXISD::DeclareParam:
873     return "NVPTXISD::DeclareParam";
874   case NVPTXISD::DeclareScalarParam:
875     return "NVPTXISD::DeclareScalarParam";
876   case NVPTXISD::DeclareRet:
877     return "NVPTXISD::DeclareRet";
878   case NVPTXISD::DeclareScalarRet:
879     return "NVPTXISD::DeclareScalarRet";
880   case NVPTXISD::DeclareRetParam:
881     return "NVPTXISD::DeclareRetParam";
882   case NVPTXISD::PrintCall:
883     return "NVPTXISD::PrintCall";
884   case NVPTXISD::PrintConvergentCall:
885     return "NVPTXISD::PrintConvergentCall";
886   case NVPTXISD::PrintCallUni:
887     return "NVPTXISD::PrintCallUni";
888   case NVPTXISD::PrintConvergentCallUni:
889     return "NVPTXISD::PrintConvergentCallUni";
890   case NVPTXISD::LoadParam:
891     return "NVPTXISD::LoadParam";
892   case NVPTXISD::LoadParamV2:
893     return "NVPTXISD::LoadParamV2";
894   case NVPTXISD::LoadParamV4:
895     return "NVPTXISD::LoadParamV4";
896   case NVPTXISD::StoreParam:
897     return "NVPTXISD::StoreParam";
898   case NVPTXISD::StoreParamV2:
899     return "NVPTXISD::StoreParamV2";
900   case NVPTXISD::StoreParamV4:
901     return "NVPTXISD::StoreParamV4";
902   case NVPTXISD::StoreParamS32:
903     return "NVPTXISD::StoreParamS32";
904   case NVPTXISD::StoreParamU32:
905     return "NVPTXISD::StoreParamU32";
906   case NVPTXISD::CallArgBegin:
907     return "NVPTXISD::CallArgBegin";
908   case NVPTXISD::CallArg:
909     return "NVPTXISD::CallArg";
910   case NVPTXISD::LastCallArg:
911     return "NVPTXISD::LastCallArg";
912   case NVPTXISD::CallArgEnd:
913     return "NVPTXISD::CallArgEnd";
914   case NVPTXISD::CallVoid:
915     return "NVPTXISD::CallVoid";
916   case NVPTXISD::CallVal:
917     return "NVPTXISD::CallVal";
918   case NVPTXISD::CallSymbol:
919     return "NVPTXISD::CallSymbol";
920   case NVPTXISD::Prototype:
921     return "NVPTXISD::Prototype";
922   case NVPTXISD::MoveParam:
923     return "NVPTXISD::MoveParam";
924   case NVPTXISD::StoreRetval:
925     return "NVPTXISD::StoreRetval";
926   case NVPTXISD::StoreRetvalV2:
927     return "NVPTXISD::StoreRetvalV2";
928   case NVPTXISD::StoreRetvalV4:
929     return "NVPTXISD::StoreRetvalV4";
930   case NVPTXISD::PseudoUseParam:
931     return "NVPTXISD::PseudoUseParam";
932   case NVPTXISD::RETURN:
933     return "NVPTXISD::RETURN";
934   case NVPTXISD::CallSeqBegin:
935     return "NVPTXISD::CallSeqBegin";
936   case NVPTXISD::CallSeqEnd:
937     return "NVPTXISD::CallSeqEnd";
938   case NVPTXISD::CallPrototype:
939     return "NVPTXISD::CallPrototype";
940   case NVPTXISD::ProxyReg:
941     return "NVPTXISD::ProxyReg";
942   case NVPTXISD::LoadV2:
943     return "NVPTXISD::LoadV2";
944   case NVPTXISD::LoadV4:
945     return "NVPTXISD::LoadV4";
946   case NVPTXISD::LDGV2:
947     return "NVPTXISD::LDGV2";
948   case NVPTXISD::LDGV4:
949     return "NVPTXISD::LDGV4";
950   case NVPTXISD::LDUV2:
951     return "NVPTXISD::LDUV2";
952   case NVPTXISD::LDUV4:
953     return "NVPTXISD::LDUV4";
954   case NVPTXISD::StoreV2:
955     return "NVPTXISD::StoreV2";
956   case NVPTXISD::StoreV4:
957     return "NVPTXISD::StoreV4";
958   case NVPTXISD::FUN_SHFL_CLAMP:
959     return "NVPTXISD::FUN_SHFL_CLAMP";
960   case NVPTXISD::FUN_SHFR_CLAMP:
961     return "NVPTXISD::FUN_SHFR_CLAMP";
962   case NVPTXISD::IMAD:
963     return "NVPTXISD::IMAD";
964   case NVPTXISD::BFE:
965     return "NVPTXISD::BFE";
966   case NVPTXISD::BFI:
967     return "NVPTXISD::BFI";
968   case NVPTXISD::PRMT:
969     return "NVPTXISD::PRMT";
970   case NVPTXISD::SETP_F16X2:
971     return "NVPTXISD::SETP_F16X2";
972   case NVPTXISD::SETP_BF16X2:
973     return "NVPTXISD::SETP_BF16X2";
974   case NVPTXISD::Dummy:
975     return "NVPTXISD::Dummy";
976   case NVPTXISD::MUL_WIDE_SIGNED:
977     return "NVPTXISD::MUL_WIDE_SIGNED";
978   case NVPTXISD::MUL_WIDE_UNSIGNED:
979     return "NVPTXISD::MUL_WIDE_UNSIGNED";
980   case NVPTXISD::Tex1DFloatS32:        return "NVPTXISD::Tex1DFloatS32";
981   case NVPTXISD::Tex1DFloatFloat:      return "NVPTXISD::Tex1DFloatFloat";
982   case NVPTXISD::Tex1DFloatFloatLevel:
983     return "NVPTXISD::Tex1DFloatFloatLevel";
984   case NVPTXISD::Tex1DFloatFloatGrad:
985     return "NVPTXISD::Tex1DFloatFloatGrad";
986   case NVPTXISD::Tex1DS32S32:          return "NVPTXISD::Tex1DS32S32";
987   case NVPTXISD::Tex1DS32Float:        return "NVPTXISD::Tex1DS32Float";
988   case NVPTXISD::Tex1DS32FloatLevel:
989     return "NVPTXISD::Tex1DS32FloatLevel";
990   case NVPTXISD::Tex1DS32FloatGrad:
991     return "NVPTXISD::Tex1DS32FloatGrad";
992   case NVPTXISD::Tex1DU32S32:          return "NVPTXISD::Tex1DU32S32";
993   case NVPTXISD::Tex1DU32Float:        return "NVPTXISD::Tex1DU32Float";
994   case NVPTXISD::Tex1DU32FloatLevel:
995     return "NVPTXISD::Tex1DU32FloatLevel";
996   case NVPTXISD::Tex1DU32FloatGrad:
997     return "NVPTXISD::Tex1DU32FloatGrad";
998   case NVPTXISD::Tex1DArrayFloatS32:   return "NVPTXISD::Tex1DArrayFloatS32";
999   case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
1000   case NVPTXISD::Tex1DArrayFloatFloatLevel:
1001     return "NVPTXISD::Tex1DArrayFloatFloatLevel";
1002   case NVPTXISD::Tex1DArrayFloatFloatGrad:
1003     return "NVPTXISD::Tex1DArrayFloatFloatGrad";
1004   case NVPTXISD::Tex1DArrayS32S32:     return "NVPTXISD::Tex1DArrayS32S32";
1005   case NVPTXISD::Tex1DArrayS32Float:   return "NVPTXISD::Tex1DArrayS32Float";
1006   case NVPTXISD::Tex1DArrayS32FloatLevel:
1007     return "NVPTXISD::Tex1DArrayS32FloatLevel";
1008   case NVPTXISD::Tex1DArrayS32FloatGrad:
1009     return "NVPTXISD::Tex1DArrayS32FloatGrad";
1010   case NVPTXISD::Tex1DArrayU32S32:     return "NVPTXISD::Tex1DArrayU32S32";
1011   case NVPTXISD::Tex1DArrayU32Float:   return "NVPTXISD::Tex1DArrayU32Float";
1012   case NVPTXISD::Tex1DArrayU32FloatLevel:
1013     return "NVPTXISD::Tex1DArrayU32FloatLevel";
1014   case NVPTXISD::Tex1DArrayU32FloatGrad:
1015     return "NVPTXISD::Tex1DArrayU32FloatGrad";
1016   case NVPTXISD::Tex2DFloatS32:        return "NVPTXISD::Tex2DFloatS32";
1017   case NVPTXISD::Tex2DFloatFloat:      return "NVPTXISD::Tex2DFloatFloat";
1018   case NVPTXISD::Tex2DFloatFloatLevel:
1019     return "NVPTXISD::Tex2DFloatFloatLevel";
1020   case NVPTXISD::Tex2DFloatFloatGrad:
1021     return "NVPTXISD::Tex2DFloatFloatGrad";
1022   case NVPTXISD::Tex2DS32S32:          return "NVPTXISD::Tex2DS32S32";
1023   case NVPTXISD::Tex2DS32Float:        return "NVPTXISD::Tex2DS32Float";
1024   case NVPTXISD::Tex2DS32FloatLevel:
1025     return "NVPTXISD::Tex2DS32FloatLevel";
1026   case NVPTXISD::Tex2DS32FloatGrad:
1027     return "NVPTXISD::Tex2DS32FloatGrad";
1028   case NVPTXISD::Tex2DU32S32:          return "NVPTXISD::Tex2DU32S32";
1029   case NVPTXISD::Tex2DU32Float:        return "NVPTXISD::Tex2DU32Float";
1030   case NVPTXISD::Tex2DU32FloatLevel:
1031     return "NVPTXISD::Tex2DU32FloatLevel";
1032   case NVPTXISD::Tex2DU32FloatGrad:
1033     return "NVPTXISD::Tex2DU32FloatGrad";
1034   case NVPTXISD::Tex2DArrayFloatS32:   return "NVPTXISD::Tex2DArrayFloatS32";
1035   case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
1036   case NVPTXISD::Tex2DArrayFloatFloatLevel:
1037     return "NVPTXISD::Tex2DArrayFloatFloatLevel";
1038   case NVPTXISD::Tex2DArrayFloatFloatGrad:
1039     return "NVPTXISD::Tex2DArrayFloatFloatGrad";
1040   case NVPTXISD::Tex2DArrayS32S32:     return "NVPTXISD::Tex2DArrayS32S32";
1041   case NVPTXISD::Tex2DArrayS32Float:   return "NVPTXISD::Tex2DArrayS32Float";
1042   case NVPTXISD::Tex2DArrayS32FloatLevel:
1043     return "NVPTXISD::Tex2DArrayS32FloatLevel";
1044   case NVPTXISD::Tex2DArrayS32FloatGrad:
1045     return "NVPTXISD::Tex2DArrayS32FloatGrad";
1046   case NVPTXISD::Tex2DArrayU32S32:     return "NVPTXISD::Tex2DArrayU32S32";
1047   case NVPTXISD::Tex2DArrayU32Float:   return "NVPTXISD::Tex2DArrayU32Float";
1048   case NVPTXISD::Tex2DArrayU32FloatLevel:
1049     return "NVPTXISD::Tex2DArrayU32FloatLevel";
1050   case NVPTXISD::Tex2DArrayU32FloatGrad:
1051     return "NVPTXISD::Tex2DArrayU32FloatGrad";
1052   case NVPTXISD::Tex3DFloatS32:        return "NVPTXISD::Tex3DFloatS32";
1053   case NVPTXISD::Tex3DFloatFloat:      return "NVPTXISD::Tex3DFloatFloat";
1054   case NVPTXISD::Tex3DFloatFloatLevel:
1055     return "NVPTXISD::Tex3DFloatFloatLevel";
1056   case NVPTXISD::Tex3DFloatFloatGrad:
1057     return "NVPTXISD::Tex3DFloatFloatGrad";
1058   case NVPTXISD::Tex3DS32S32:          return "NVPTXISD::Tex3DS32S32";
1059   case NVPTXISD::Tex3DS32Float:        return "NVPTXISD::Tex3DS32Float";
1060   case NVPTXISD::Tex3DS32FloatLevel:
1061     return "NVPTXISD::Tex3DS32FloatLevel";
1062   case NVPTXISD::Tex3DS32FloatGrad:
1063     return "NVPTXISD::Tex3DS32FloatGrad";
1064   case NVPTXISD::Tex3DU32S32:          return "NVPTXISD::Tex3DU32S32";
1065   case NVPTXISD::Tex3DU32Float:        return "NVPTXISD::Tex3DU32Float";
1066   case NVPTXISD::Tex3DU32FloatLevel:
1067     return "NVPTXISD::Tex3DU32FloatLevel";
1068   case NVPTXISD::Tex3DU32FloatGrad:
1069     return "NVPTXISD::Tex3DU32FloatGrad";
1070   case NVPTXISD::TexCubeFloatFloat:      return "NVPTXISD::TexCubeFloatFloat";
1071   case NVPTXISD::TexCubeFloatFloatLevel:
1072     return "NVPTXISD::TexCubeFloatFloatLevel";
1073   case NVPTXISD::TexCubeS32Float:        return "NVPTXISD::TexCubeS32Float";
1074   case NVPTXISD::TexCubeS32FloatLevel:
1075     return "NVPTXISD::TexCubeS32FloatLevel";
1076   case NVPTXISD::TexCubeU32Float:        return "NVPTXISD::TexCubeU32Float";
1077   case NVPTXISD::TexCubeU32FloatLevel:
1078     return "NVPTXISD::TexCubeU32FloatLevel";
1079   case NVPTXISD::TexCubeArrayFloatFloat:
1080     return "NVPTXISD::TexCubeArrayFloatFloat";
1081   case NVPTXISD::TexCubeArrayFloatFloatLevel:
1082     return "NVPTXISD::TexCubeArrayFloatFloatLevel";
1083   case NVPTXISD::TexCubeArrayS32Float:
1084     return "NVPTXISD::TexCubeArrayS32Float";
1085   case NVPTXISD::TexCubeArrayS32FloatLevel:
1086     return "NVPTXISD::TexCubeArrayS32FloatLevel";
1087   case NVPTXISD::TexCubeArrayU32Float:
1088     return "NVPTXISD::TexCubeArrayU32Float";
1089   case NVPTXISD::TexCubeArrayU32FloatLevel:
1090     return "NVPTXISD::TexCubeArrayU32FloatLevel";
1091   case NVPTXISD::Tld4R2DFloatFloat:
1092     return "NVPTXISD::Tld4R2DFloatFloat";
1093   case NVPTXISD::Tld4G2DFloatFloat:
1094     return "NVPTXISD::Tld4G2DFloatFloat";
1095   case NVPTXISD::Tld4B2DFloatFloat:
1096     return "NVPTXISD::Tld4B2DFloatFloat";
1097   case NVPTXISD::Tld4A2DFloatFloat:
1098     return "NVPTXISD::Tld4A2DFloatFloat";
1099   case NVPTXISD::Tld4R2DS64Float:
1100     return "NVPTXISD::Tld4R2DS64Float";
1101   case NVPTXISD::Tld4G2DS64Float:
1102     return "NVPTXISD::Tld4G2DS64Float";
1103   case NVPTXISD::Tld4B2DS64Float:
1104     return "NVPTXISD::Tld4B2DS64Float";
1105   case NVPTXISD::Tld4A2DS64Float:
1106     return "NVPTXISD::Tld4A2DS64Float";
1107   case NVPTXISD::Tld4R2DU64Float:
1108     return "NVPTXISD::Tld4R2DU64Float";
1109   case NVPTXISD::Tld4G2DU64Float:
1110     return "NVPTXISD::Tld4G2DU64Float";
1111   case NVPTXISD::Tld4B2DU64Float:
1112     return "NVPTXISD::Tld4B2DU64Float";
1113   case NVPTXISD::Tld4A2DU64Float:
1114     return "NVPTXISD::Tld4A2DU64Float";
1115 
1116   case NVPTXISD::TexUnified1DFloatS32:
1117     return "NVPTXISD::TexUnified1DFloatS32";
1118   case NVPTXISD::TexUnified1DFloatFloat:
1119     return "NVPTXISD::TexUnified1DFloatFloat";
1120   case NVPTXISD::TexUnified1DFloatFloatLevel:
1121     return "NVPTXISD::TexUnified1DFloatFloatLevel";
1122   case NVPTXISD::TexUnified1DFloatFloatGrad:
1123     return "NVPTXISD::TexUnified1DFloatFloatGrad";
1124   case NVPTXISD::TexUnified1DS32S32:
1125     return "NVPTXISD::TexUnified1DS32S32";
1126   case NVPTXISD::TexUnified1DS32Float:
1127     return "NVPTXISD::TexUnified1DS32Float";
1128   case NVPTXISD::TexUnified1DS32FloatLevel:
1129     return "NVPTXISD::TexUnified1DS32FloatLevel";
1130   case NVPTXISD::TexUnified1DS32FloatGrad:
1131     return "NVPTXISD::TexUnified1DS32FloatGrad";
1132   case NVPTXISD::TexUnified1DU32S32:
1133     return "NVPTXISD::TexUnified1DU32S32";
1134   case NVPTXISD::TexUnified1DU32Float:
1135     return "NVPTXISD::TexUnified1DU32Float";
1136   case NVPTXISD::TexUnified1DU32FloatLevel:
1137     return "NVPTXISD::TexUnified1DU32FloatLevel";
1138   case NVPTXISD::TexUnified1DU32FloatGrad:
1139     return "NVPTXISD::TexUnified1DU32FloatGrad";
1140   case NVPTXISD::TexUnified1DArrayFloatS32:
1141     return "NVPTXISD::TexUnified1DArrayFloatS32";
1142   case NVPTXISD::TexUnified1DArrayFloatFloat:
1143     return "NVPTXISD::TexUnified1DArrayFloatFloat";
1144   case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
1145     return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
1146   case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
1147     return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
1148   case NVPTXISD::TexUnified1DArrayS32S32:
1149     return "NVPTXISD::TexUnified1DArrayS32S32";
1150   case NVPTXISD::TexUnified1DArrayS32Float:
1151     return "NVPTXISD::TexUnified1DArrayS32Float";
1152   case NVPTXISD::TexUnified1DArrayS32FloatLevel:
1153     return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
1154   case NVPTXISD::TexUnified1DArrayS32FloatGrad:
1155     return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
1156   case NVPTXISD::TexUnified1DArrayU32S32:
1157     return "NVPTXISD::TexUnified1DArrayU32S32";
1158   case NVPTXISD::TexUnified1DArrayU32Float:
1159     return "NVPTXISD::TexUnified1DArrayU32Float";
1160   case NVPTXISD::TexUnified1DArrayU32FloatLevel:
1161     return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
1162   case NVPTXISD::TexUnified1DArrayU32FloatGrad:
1163     return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
1164   case NVPTXISD::TexUnified2DFloatS32:
1165     return "NVPTXISD::TexUnified2DFloatS32";
1166   case NVPTXISD::TexUnified2DFloatFloat:
1167     return "NVPTXISD::TexUnified2DFloatFloat";
1168   case NVPTXISD::TexUnified2DFloatFloatLevel:
1169     return "NVPTXISD::TexUnified2DFloatFloatLevel";
1170   case NVPTXISD::TexUnified2DFloatFloatGrad:
1171     return "NVPTXISD::TexUnified2DFloatFloatGrad";
1172   case NVPTXISD::TexUnified2DS32S32:
1173     return "NVPTXISD::TexUnified2DS32S32";
1174   case NVPTXISD::TexUnified2DS32Float:
1175     return "NVPTXISD::TexUnified2DS32Float";
1176   case NVPTXISD::TexUnified2DS32FloatLevel:
1177     return "NVPTXISD::TexUnified2DS32FloatLevel";
1178   case NVPTXISD::TexUnified2DS32FloatGrad:
1179     return "NVPTXISD::TexUnified2DS32FloatGrad";
1180   case NVPTXISD::TexUnified2DU32S32:
1181     return "NVPTXISD::TexUnified2DU32S32";
1182   case NVPTXISD::TexUnified2DU32Float:
1183     return "NVPTXISD::TexUnified2DU32Float";
1184   case NVPTXISD::TexUnified2DU32FloatLevel:
1185     return "NVPTXISD::TexUnified2DU32FloatLevel";
1186   case NVPTXISD::TexUnified2DU32FloatGrad:
1187     return "NVPTXISD::TexUnified2DU32FloatGrad";
1188   case NVPTXISD::TexUnified2DArrayFloatS32:
1189     return "NVPTXISD::TexUnified2DArrayFloatS32";
1190   case NVPTXISD::TexUnified2DArrayFloatFloat:
1191     return "NVPTXISD::TexUnified2DArrayFloatFloat";
1192   case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
1193     return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
1194   case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
1195     return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
1196   case NVPTXISD::TexUnified2DArrayS32S32:
1197     return "NVPTXISD::TexUnified2DArrayS32S32";
1198   case NVPTXISD::TexUnified2DArrayS32Float:
1199     return "NVPTXISD::TexUnified2DArrayS32Float";
1200   case NVPTXISD::TexUnified2DArrayS32FloatLevel:
1201     return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
1202   case NVPTXISD::TexUnified2DArrayS32FloatGrad:
1203     return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
1204   case NVPTXISD::TexUnified2DArrayU32S32:
1205     return "NVPTXISD::TexUnified2DArrayU32S32";
1206   case NVPTXISD::TexUnified2DArrayU32Float:
1207     return "NVPTXISD::TexUnified2DArrayU32Float";
1208   case NVPTXISD::TexUnified2DArrayU32FloatLevel:
1209     return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
1210   case NVPTXISD::TexUnified2DArrayU32FloatGrad:
1211     return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
1212   case NVPTXISD::TexUnified3DFloatS32:
1213     return "NVPTXISD::TexUnified3DFloatS32";
1214   case NVPTXISD::TexUnified3DFloatFloat:
1215     return "NVPTXISD::TexUnified3DFloatFloat";
1216   case NVPTXISD::TexUnified3DFloatFloatLevel:
1217     return "NVPTXISD::TexUnified3DFloatFloatLevel";
1218   case NVPTXISD::TexUnified3DFloatFloatGrad:
1219     return "NVPTXISD::TexUnified3DFloatFloatGrad";
1220   case NVPTXISD::TexUnified3DS32S32:
1221     return "NVPTXISD::TexUnified3DS32S32";
1222   case NVPTXISD::TexUnified3DS32Float:
1223     return "NVPTXISD::TexUnified3DS32Float";
1224   case NVPTXISD::TexUnified3DS32FloatLevel:
1225     return "NVPTXISD::TexUnified3DS32FloatLevel";
1226   case NVPTXISD::TexUnified3DS32FloatGrad:
1227     return "NVPTXISD::TexUnified3DS32FloatGrad";
1228   case NVPTXISD::TexUnified3DU32S32:
1229     return "NVPTXISD::TexUnified3DU32S32";
1230   case NVPTXISD::TexUnified3DU32Float:
1231     return "NVPTXISD::TexUnified3DU32Float";
1232   case NVPTXISD::TexUnified3DU32FloatLevel:
1233     return "NVPTXISD::TexUnified3DU32FloatLevel";
1234   case NVPTXISD::TexUnified3DU32FloatGrad:
1235     return "NVPTXISD::TexUnified3DU32FloatGrad";
1236   case NVPTXISD::TexUnifiedCubeFloatFloat:
1237     return "NVPTXISD::TexUnifiedCubeFloatFloat";
1238   case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
1239     return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
1240   case NVPTXISD::TexUnifiedCubeS32Float:
1241     return "NVPTXISD::TexUnifiedCubeS32Float";
1242   case NVPTXISD::TexUnifiedCubeS32FloatLevel:
1243     return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
1244   case NVPTXISD::TexUnifiedCubeU32Float:
1245     return "NVPTXISD::TexUnifiedCubeU32Float";
1246   case NVPTXISD::TexUnifiedCubeU32FloatLevel:
1247     return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
1248   case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
1249     return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
1250   case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
1251     return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
1252   case NVPTXISD::TexUnifiedCubeArrayS32Float:
1253     return "NVPTXISD::TexUnifiedCubeArrayS32Float";
1254   case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
1255     return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
1256   case NVPTXISD::TexUnifiedCubeArrayU32Float:
1257     return "NVPTXISD::TexUnifiedCubeArrayU32Float";
1258   case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
1259     return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
1260   case NVPTXISD::Tld4UnifiedR2DFloatFloat:
1261     return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
1262   case NVPTXISD::Tld4UnifiedG2DFloatFloat:
1263     return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
1264   case NVPTXISD::Tld4UnifiedB2DFloatFloat:
1265     return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
1266   case NVPTXISD::Tld4UnifiedA2DFloatFloat:
1267     return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
1268   case NVPTXISD::Tld4UnifiedR2DS64Float:
1269     return "NVPTXISD::Tld4UnifiedR2DS64Float";
1270   case NVPTXISD::Tld4UnifiedG2DS64Float:
1271     return "NVPTXISD::Tld4UnifiedG2DS64Float";
1272   case NVPTXISD::Tld4UnifiedB2DS64Float:
1273     return "NVPTXISD::Tld4UnifiedB2DS64Float";
1274   case NVPTXISD::Tld4UnifiedA2DS64Float:
1275     return "NVPTXISD::Tld4UnifiedA2DS64Float";
1276   case NVPTXISD::Tld4UnifiedR2DU64Float:
1277     return "NVPTXISD::Tld4UnifiedR2DU64Float";
1278   case NVPTXISD::Tld4UnifiedG2DU64Float:
1279     return "NVPTXISD::Tld4UnifiedG2DU64Float";
1280   case NVPTXISD::Tld4UnifiedB2DU64Float:
1281     return "NVPTXISD::Tld4UnifiedB2DU64Float";
1282   case NVPTXISD::Tld4UnifiedA2DU64Float:
1283     return "NVPTXISD::Tld4UnifiedA2DU64Float";
1284 
1285   case NVPTXISD::Suld1DI8Clamp:          return "NVPTXISD::Suld1DI8Clamp";
1286   case NVPTXISD::Suld1DI16Clamp:         return "NVPTXISD::Suld1DI16Clamp";
1287   case NVPTXISD::Suld1DI32Clamp:         return "NVPTXISD::Suld1DI32Clamp";
1288   case NVPTXISD::Suld1DI64Clamp:         return "NVPTXISD::Suld1DI64Clamp";
1289   case NVPTXISD::Suld1DV2I8Clamp:        return "NVPTXISD::Suld1DV2I8Clamp";
1290   case NVPTXISD::Suld1DV2I16Clamp:       return "NVPTXISD::Suld1DV2I16Clamp";
1291   case NVPTXISD::Suld1DV2I32Clamp:       return "NVPTXISD::Suld1DV2I32Clamp";
1292   case NVPTXISD::Suld1DV2I64Clamp:       return "NVPTXISD::Suld1DV2I64Clamp";
1293   case NVPTXISD::Suld1DV4I8Clamp:        return "NVPTXISD::Suld1DV4I8Clamp";
1294   case NVPTXISD::Suld1DV4I16Clamp:       return "NVPTXISD::Suld1DV4I16Clamp";
1295   case NVPTXISD::Suld1DV4I32Clamp:       return "NVPTXISD::Suld1DV4I32Clamp";
1296 
1297   case NVPTXISD::Suld1DArrayI8Clamp:   return "NVPTXISD::Suld1DArrayI8Clamp";
1298   case NVPTXISD::Suld1DArrayI16Clamp:  return "NVPTXISD::Suld1DArrayI16Clamp";
1299   case NVPTXISD::Suld1DArrayI32Clamp:  return "NVPTXISD::Suld1DArrayI32Clamp";
1300   case NVPTXISD::Suld1DArrayI64Clamp:  return "NVPTXISD::Suld1DArrayI64Clamp";
1301   case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
1302   case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
1303   case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
1304   case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
1305   case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
1306   case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
1307   case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
1308 
1309   case NVPTXISD::Suld2DI8Clamp:          return "NVPTXISD::Suld2DI8Clamp";
1310   case NVPTXISD::Suld2DI16Clamp:         return "NVPTXISD::Suld2DI16Clamp";
1311   case NVPTXISD::Suld2DI32Clamp:         return "NVPTXISD::Suld2DI32Clamp";
1312   case NVPTXISD::Suld2DI64Clamp:         return "NVPTXISD::Suld2DI64Clamp";
1313   case NVPTXISD::Suld2DV2I8Clamp:        return "NVPTXISD::Suld2DV2I8Clamp";
1314   case NVPTXISD::Suld2DV2I16Clamp:       return "NVPTXISD::Suld2DV2I16Clamp";
1315   case NVPTXISD::Suld2DV2I32Clamp:       return "NVPTXISD::Suld2DV2I32Clamp";
1316   case NVPTXISD::Suld2DV2I64Clamp:       return "NVPTXISD::Suld2DV2I64Clamp";
1317   case NVPTXISD::Suld2DV4I8Clamp:        return "NVPTXISD::Suld2DV4I8Clamp";
1318   case NVPTXISD::Suld2DV4I16Clamp:       return "NVPTXISD::Suld2DV4I16Clamp";
1319   case NVPTXISD::Suld2DV4I32Clamp:       return "NVPTXISD::Suld2DV4I32Clamp";
1320 
1321   case NVPTXISD::Suld2DArrayI8Clamp:   return "NVPTXISD::Suld2DArrayI8Clamp";
1322   case NVPTXISD::Suld2DArrayI16Clamp:  return "NVPTXISD::Suld2DArrayI16Clamp";
1323   case NVPTXISD::Suld2DArrayI32Clamp:  return "NVPTXISD::Suld2DArrayI32Clamp";
1324   case NVPTXISD::Suld2DArrayI64Clamp:  return "NVPTXISD::Suld2DArrayI64Clamp";
1325   case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
1326   case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
1327   case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
1328   case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
1329   case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
1330   case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
1331   case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
1332 
1333   case NVPTXISD::Suld3DI8Clamp:          return "NVPTXISD::Suld3DI8Clamp";
1334   case NVPTXISD::Suld3DI16Clamp:         return "NVPTXISD::Suld3DI16Clamp";
1335   case NVPTXISD::Suld3DI32Clamp:         return "NVPTXISD::Suld3DI32Clamp";
1336   case NVPTXISD::Suld3DI64Clamp:         return "NVPTXISD::Suld3DI64Clamp";
1337   case NVPTXISD::Suld3DV2I8Clamp:        return "NVPTXISD::Suld3DV2I8Clamp";
1338   case NVPTXISD::Suld3DV2I16Clamp:       return "NVPTXISD::Suld3DV2I16Clamp";
1339   case NVPTXISD::Suld3DV2I32Clamp:       return "NVPTXISD::Suld3DV2I32Clamp";
1340   case NVPTXISD::Suld3DV2I64Clamp:       return "NVPTXISD::Suld3DV2I64Clamp";
1341   case NVPTXISD::Suld3DV4I8Clamp:        return "NVPTXISD::Suld3DV4I8Clamp";
1342   case NVPTXISD::Suld3DV4I16Clamp:       return "NVPTXISD::Suld3DV4I16Clamp";
1343   case NVPTXISD::Suld3DV4I32Clamp:       return "NVPTXISD::Suld3DV4I32Clamp";
1344 
1345   case NVPTXISD::Suld1DI8Trap:          return "NVPTXISD::Suld1DI8Trap";
1346   case NVPTXISD::Suld1DI16Trap:         return "NVPTXISD::Suld1DI16Trap";
1347   case NVPTXISD::Suld1DI32Trap:         return "NVPTXISD::Suld1DI32Trap";
1348   case NVPTXISD::Suld1DI64Trap:         return "NVPTXISD::Suld1DI64Trap";
1349   case NVPTXISD::Suld1DV2I8Trap:        return "NVPTXISD::Suld1DV2I8Trap";
1350   case NVPTXISD::Suld1DV2I16Trap:       return "NVPTXISD::Suld1DV2I16Trap";
1351   case NVPTXISD::Suld1DV2I32Trap:       return "NVPTXISD::Suld1DV2I32Trap";
1352   case NVPTXISD::Suld1DV2I64Trap:       return "NVPTXISD::Suld1DV2I64Trap";
1353   case NVPTXISD::Suld1DV4I8Trap:        return "NVPTXISD::Suld1DV4I8Trap";
1354   case NVPTXISD::Suld1DV4I16Trap:       return "NVPTXISD::Suld1DV4I16Trap";
1355   case NVPTXISD::Suld1DV4I32Trap:       return "NVPTXISD::Suld1DV4I32Trap";
1356 
1357   case NVPTXISD::Suld1DArrayI8Trap:     return "NVPTXISD::Suld1DArrayI8Trap";
1358   case NVPTXISD::Suld1DArrayI16Trap:    return "NVPTXISD::Suld1DArrayI16Trap";
1359   case NVPTXISD::Suld1DArrayI32Trap:    return "NVPTXISD::Suld1DArrayI32Trap";
1360   case NVPTXISD::Suld1DArrayI64Trap:    return "NVPTXISD::Suld1DArrayI64Trap";
1361   case NVPTXISD::Suld1DArrayV2I8Trap:   return "NVPTXISD::Suld1DArrayV2I8Trap";
1362   case NVPTXISD::Suld1DArrayV2I16Trap:  return "NVPTXISD::Suld1DArrayV2I16Trap";
1363   case NVPTXISD::Suld1DArrayV2I32Trap:  return "NVPTXISD::Suld1DArrayV2I32Trap";
1364   case NVPTXISD::Suld1DArrayV2I64Trap:  return "NVPTXISD::Suld1DArrayV2I64Trap";
1365   case NVPTXISD::Suld1DArrayV4I8Trap:   return "NVPTXISD::Suld1DArrayV4I8Trap";
1366   case NVPTXISD::Suld1DArrayV4I16Trap:  return "NVPTXISD::Suld1DArrayV4I16Trap";
1367   case NVPTXISD::Suld1DArrayV4I32Trap:  return "NVPTXISD::Suld1DArrayV4I32Trap";
1368 
1369   case NVPTXISD::Suld2DI8Trap:          return "NVPTXISD::Suld2DI8Trap";
1370   case NVPTXISD::Suld2DI16Trap:         return "NVPTXISD::Suld2DI16Trap";
1371   case NVPTXISD::Suld2DI32Trap:         return "NVPTXISD::Suld2DI32Trap";
1372   case NVPTXISD::Suld2DI64Trap:         return "NVPTXISD::Suld2DI64Trap";
1373   case NVPTXISD::Suld2DV2I8Trap:        return "NVPTXISD::Suld2DV2I8Trap";
1374   case NVPTXISD::Suld2DV2I16Trap:       return "NVPTXISD::Suld2DV2I16Trap";
1375   case NVPTXISD::Suld2DV2I32Trap:       return "NVPTXISD::Suld2DV2I32Trap";
1376   case NVPTXISD::Suld2DV2I64Trap:       return "NVPTXISD::Suld2DV2I64Trap";
1377   case NVPTXISD::Suld2DV4I8Trap:        return "NVPTXISD::Suld2DV4I8Trap";
1378   case NVPTXISD::Suld2DV4I16Trap:       return "NVPTXISD::Suld2DV4I16Trap";
1379   case NVPTXISD::Suld2DV4I32Trap:       return "NVPTXISD::Suld2DV4I32Trap";
1380 
1381   case NVPTXISD::Suld2DArrayI8Trap:     return "NVPTXISD::Suld2DArrayI8Trap";
1382   case NVPTXISD::Suld2DArrayI16Trap:    return "NVPTXISD::Suld2DArrayI16Trap";
1383   case NVPTXISD::Suld2DArrayI32Trap:    return "NVPTXISD::Suld2DArrayI32Trap";
1384   case NVPTXISD::Suld2DArrayI64Trap:    return "NVPTXISD::Suld2DArrayI64Trap";
1385   case NVPTXISD::Suld2DArrayV2I8Trap:   return "NVPTXISD::Suld2DArrayV2I8Trap";
1386   case NVPTXISD::Suld2DArrayV2I16Trap:  return "NVPTXISD::Suld2DArrayV2I16Trap";
1387   case NVPTXISD::Suld2DArrayV2I32Trap:  return "NVPTXISD::Suld2DArrayV2I32Trap";
1388   case NVPTXISD::Suld2DArrayV2I64Trap:  return "NVPTXISD::Suld2DArrayV2I64Trap";
1389   case NVPTXISD::Suld2DArrayV4I8Trap:   return "NVPTXISD::Suld2DArrayV4I8Trap";
1390   case NVPTXISD::Suld2DArrayV4I16Trap:  return "NVPTXISD::Suld2DArrayV4I16Trap";
1391   case NVPTXISD::Suld2DArrayV4I32Trap:  return "NVPTXISD::Suld2DArrayV4I32Trap";
1392 
1393   case NVPTXISD::Suld3DI8Trap:          return "NVPTXISD::Suld3DI8Trap";
1394   case NVPTXISD::Suld3DI16Trap:         return "NVPTXISD::Suld3DI16Trap";
1395   case NVPTXISD::Suld3DI32Trap:         return "NVPTXISD::Suld3DI32Trap";
1396   case NVPTXISD::Suld3DI64Trap:         return "NVPTXISD::Suld3DI64Trap";
1397   case NVPTXISD::Suld3DV2I8Trap:        return "NVPTXISD::Suld3DV2I8Trap";
1398   case NVPTXISD::Suld3DV2I16Trap:       return "NVPTXISD::Suld3DV2I16Trap";
1399   case NVPTXISD::Suld3DV2I32Trap:       return "NVPTXISD::Suld3DV2I32Trap";
1400   case NVPTXISD::Suld3DV2I64Trap:       return "NVPTXISD::Suld3DV2I64Trap";
1401   case NVPTXISD::Suld3DV4I8Trap:        return "NVPTXISD::Suld3DV4I8Trap";
1402   case NVPTXISD::Suld3DV4I16Trap:       return "NVPTXISD::Suld3DV4I16Trap";
1403   case NVPTXISD::Suld3DV4I32Trap:       return "NVPTXISD::Suld3DV4I32Trap";
1404 
1405   case NVPTXISD::Suld1DI8Zero:          return "NVPTXISD::Suld1DI8Zero";
1406   case NVPTXISD::Suld1DI16Zero:         return "NVPTXISD::Suld1DI16Zero";
1407   case NVPTXISD::Suld1DI32Zero:         return "NVPTXISD::Suld1DI32Zero";
1408   case NVPTXISD::Suld1DI64Zero:         return "NVPTXISD::Suld1DI64Zero";
1409   case NVPTXISD::Suld1DV2I8Zero:        return "NVPTXISD::Suld1DV2I8Zero";
1410   case NVPTXISD::Suld1DV2I16Zero:       return "NVPTXISD::Suld1DV2I16Zero";
1411   case NVPTXISD::Suld1DV2I32Zero:       return "NVPTXISD::Suld1DV2I32Zero";
1412   case NVPTXISD::Suld1DV2I64Zero:       return "NVPTXISD::Suld1DV2I64Zero";
1413   case NVPTXISD::Suld1DV4I8Zero:        return "NVPTXISD::Suld1DV4I8Zero";
1414   case NVPTXISD::Suld1DV4I16Zero:       return "NVPTXISD::Suld1DV4I16Zero";
1415   case NVPTXISD::Suld1DV4I32Zero:       return "NVPTXISD::Suld1DV4I32Zero";
1416 
1417   case NVPTXISD::Suld1DArrayI8Zero:     return "NVPTXISD::Suld1DArrayI8Zero";
1418   case NVPTXISD::Suld1DArrayI16Zero:    return "NVPTXISD::Suld1DArrayI16Zero";
1419   case NVPTXISD::Suld1DArrayI32Zero:    return "NVPTXISD::Suld1DArrayI32Zero";
1420   case NVPTXISD::Suld1DArrayI64Zero:    return "NVPTXISD::Suld1DArrayI64Zero";
1421   case NVPTXISD::Suld1DArrayV2I8Zero:   return "NVPTXISD::Suld1DArrayV2I8Zero";
1422   case NVPTXISD::Suld1DArrayV2I16Zero:  return "NVPTXISD::Suld1DArrayV2I16Zero";
1423   case NVPTXISD::Suld1DArrayV2I32Zero:  return "NVPTXISD::Suld1DArrayV2I32Zero";
1424   case NVPTXISD::Suld1DArrayV2I64Zero:  return "NVPTXISD::Suld1DArrayV2I64Zero";
1425   case NVPTXISD::Suld1DArrayV4I8Zero:   return "NVPTXISD::Suld1DArrayV4I8Zero";
1426   case NVPTXISD::Suld1DArrayV4I16Zero:  return "NVPTXISD::Suld1DArrayV4I16Zero";
1427   case NVPTXISD::Suld1DArrayV4I32Zero:  return "NVPTXISD::Suld1DArrayV4I32Zero";
1428 
1429   case NVPTXISD::Suld2DI8Zero:          return "NVPTXISD::Suld2DI8Zero";
1430   case NVPTXISD::Suld2DI16Zero:         return "NVPTXISD::Suld2DI16Zero";
1431   case NVPTXISD::Suld2DI32Zero:         return "NVPTXISD::Suld2DI32Zero";
1432   case NVPTXISD::Suld2DI64Zero:         return "NVPTXISD::Suld2DI64Zero";
1433   case NVPTXISD::Suld2DV2I8Zero:        return "NVPTXISD::Suld2DV2I8Zero";
1434   case NVPTXISD::Suld2DV2I16Zero:       return "NVPTXISD::Suld2DV2I16Zero";
1435   case NVPTXISD::Suld2DV2I32Zero:       return "NVPTXISD::Suld2DV2I32Zero";
1436   case NVPTXISD::Suld2DV2I64Zero:       return "NVPTXISD::Suld2DV2I64Zero";
1437   case NVPTXISD::Suld2DV4I8Zero:        return "NVPTXISD::Suld2DV4I8Zero";
1438   case NVPTXISD::Suld2DV4I16Zero:       return "NVPTXISD::Suld2DV4I16Zero";
1439   case NVPTXISD::Suld2DV4I32Zero:       return "NVPTXISD::Suld2DV4I32Zero";
1440 
1441   case NVPTXISD::Suld2DArrayI8Zero:     return "NVPTXISD::Suld2DArrayI8Zero";
1442   case NVPTXISD::Suld2DArrayI16Zero:    return "NVPTXISD::Suld2DArrayI16Zero";
1443   case NVPTXISD::Suld2DArrayI32Zero:    return "NVPTXISD::Suld2DArrayI32Zero";
1444   case NVPTXISD::Suld2DArrayI64Zero:    return "NVPTXISD::Suld2DArrayI64Zero";
1445   case NVPTXISD::Suld2DArrayV2I8Zero:   return "NVPTXISD::Suld2DArrayV2I8Zero";
1446   case NVPTXISD::Suld2DArrayV2I16Zero:  return "NVPTXISD::Suld2DArrayV2I16Zero";
1447   case NVPTXISD::Suld2DArrayV2I32Zero:  return "NVPTXISD::Suld2DArrayV2I32Zero";
1448   case NVPTXISD::Suld2DArrayV2I64Zero:  return "NVPTXISD::Suld2DArrayV2I64Zero";
1449   case NVPTXISD::Suld2DArrayV4I8Zero:   return "NVPTXISD::Suld2DArrayV4I8Zero";
1450   case NVPTXISD::Suld2DArrayV4I16Zero:  return "NVPTXISD::Suld2DArrayV4I16Zero";
1451   case NVPTXISD::Suld2DArrayV4I32Zero:  return "NVPTXISD::Suld2DArrayV4I32Zero";
1452 
1453   case NVPTXISD::Suld3DI8Zero:          return "NVPTXISD::Suld3DI8Zero";
1454   case NVPTXISD::Suld3DI16Zero:         return "NVPTXISD::Suld3DI16Zero";
1455   case NVPTXISD::Suld3DI32Zero:         return "NVPTXISD::Suld3DI32Zero";
1456   case NVPTXISD::Suld3DI64Zero:         return "NVPTXISD::Suld3DI64Zero";
1457   case NVPTXISD::Suld3DV2I8Zero:        return "NVPTXISD::Suld3DV2I8Zero";
1458   case NVPTXISD::Suld3DV2I16Zero:       return "NVPTXISD::Suld3DV2I16Zero";
1459   case NVPTXISD::Suld3DV2I32Zero:       return "NVPTXISD::Suld3DV2I32Zero";
1460   case NVPTXISD::Suld3DV2I64Zero:       return "NVPTXISD::Suld3DV2I64Zero";
1461   case NVPTXISD::Suld3DV4I8Zero:        return "NVPTXISD::Suld3DV4I8Zero";
1462   case NVPTXISD::Suld3DV4I16Zero:       return "NVPTXISD::Suld3DV4I16Zero";
1463   case NVPTXISD::Suld3DV4I32Zero:       return "NVPTXISD::Suld3DV4I32Zero";
1464   }
1465   return nullptr;
1466 }
1467 
1468 TargetLoweringBase::LegalizeTypeAction
1469 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1470   if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1471       VT.getScalarType() == MVT::i1)
1472     return TypeSplitVector;
1473   if (Isv2x16VT(VT))
1474     return TypeLegal;
1475   return TargetLoweringBase::getPreferredVectorAction(VT);
1476 }
1477 
1478 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1479                                              int Enabled, int &ExtraSteps,
1480                                              bool &UseOneConst,
1481                                              bool Reciprocal) const {
1482   if (!(Enabled == ReciprocalEstimate::Enabled ||
1483         (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
1484     return SDValue();
1485 
1486   if (ExtraSteps == ReciprocalEstimate::Unspecified)
1487     ExtraSteps = 0;
1488 
1489   SDLoc DL(Operand);
1490   EVT VT = Operand.getValueType();
1491   bool Ftz = useF32FTZ(DAG.getMachineFunction());
1492 
1493   auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1494     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1495                        DAG.getConstant(IID, DL, MVT::i32), Operand);
1496   };
1497 
1498   // The sqrt and rsqrt refinement processes assume we always start out with an
1499   // approximation of the rsqrt.  Therefore, if we're going to do any refinement
1500   // (i.e. ExtraSteps > 0), we must return an rsqrt.  But if we're *not* doing
1501   // any refinement, we must return a regular sqrt.
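       // (The refinement itself is done by the generic DAG combiner; each extra
       // step is roughly one Newton-Raphson iteration, r' = r * (1.5 - 0.5*x*r*r),
       // applied to the rsqrt estimate, so only a coarse estimate is needed here.)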
1502   if (Reciprocal || ExtraSteps > 0) {
1503     if (VT == MVT::f32)
1504       return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1505                                    : Intrinsic::nvvm_rsqrt_approx_f);
1506     else if (VT == MVT::f64)
1507       return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1508     else
1509       return SDValue();
1510   } else {
1511     if (VT == MVT::f32)
1512       return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1513                                    : Intrinsic::nvvm_sqrt_approx_f);
1514     else {
1515       // There's no sqrt.approx.f64 instruction, so we emit
1516       // reciprocal(rsqrt(x)).  This is faster than
1517       // select(x == 0, 0, x * rsqrt(x)).  (In fact, it's faster than plain
1518       // x * rsqrt(x).)
1519       return DAG.getNode(
1520           ISD::INTRINSIC_WO_CHAIN, DL, VT,
1521           DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1522           MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1523     }
1524   }
1525 }
1526 
1527 SDValue
1528 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
1529   SDLoc dl(Op);
1530   const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1531   auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1532   Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1533   return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1534 }
1535 
1536 static bool IsTypePassedAsArray(const Type *Ty) {
1537   return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) ||
1538          Ty->isHalfTy() || Ty->isBFloatTy();
1539 }
1540 
1541 std::string NVPTXTargetLowering::getPrototype(
1542     const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1543     const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
1544     std::optional<std::pair<unsigned, const APInt &>> VAInfo,
1545     const CallBase &CB, unsigned UniqueCallSite) const {
1546   auto PtrVT = getPointerTy(DL);
1547 
1548   bool isABI = (STI.getSmVersion() >= 20);
1549   assert(isABI && "Non-ABI compilation is not supported");
1550   if (!isABI)
1551     return "";
1552 
1553   std::string Prototype;
1554   raw_string_ostream O(Prototype);
1555   O << "prototype_" << UniqueCallSite << " : .callprototype ";
1556 
1557   if (retTy->getTypeID() == Type::VoidTyID) {
1558     O << "()";
1559   } else {
1560     O << "(";
1561     if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) &&
1562         !IsTypePassedAsArray(retTy)) {
1563       unsigned size = 0;
1564       if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1565         size = ITy->getBitWidth();
1566       } else {
1567         assert(retTy->isFloatingPointTy() &&
1568                "Floating point type expected here");
1569         size = retTy->getPrimitiveSizeInBits();
1570       }
1571       // PTX ABI requires all scalar return values to be at least 32
1572       // bits in size.  fp16 normally uses .b16 as its storage type in
1573       // PTX, so its size must be adjusted here, too.
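           // For example, both an i8 and a half return value end up being
           // declared as ".param .b32 _" here.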
1574       size = promoteScalarArgumentSize(size);
1575 
1576       O << ".param .b" << size << " _";
1577     } else if (isa<PointerType>(retTy)) {
1578       O << ".param .b" << PtrVT.getSizeInBits() << " _";
1579     } else if (IsTypePassedAsArray(retTy)) {
1580       O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
1581         << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
1582     } else {
1583       llvm_unreachable("Unknown return type");
1584     }
1585     O << ") ";
1586   }
1587   O << "_ (";
1588 
1589   bool first = true;
1590 
1591   const Function *F = CB.getFunction();
1592   unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
1593   for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
1594     Type *Ty = Args[i].Ty;
1595     if (!first) {
1596       O << ", ";
1597     }
1598     first = false;
1599 
1600     if (!Outs[OIdx].Flags.isByVal()) {
1601       if (IsTypePassedAsArray(Ty)) {
1602         unsigned ParamAlign = 0;
1603         const CallInst *CallI = cast<CallInst>(&CB);
1604         // +1 because index 0 is reserved for return type alignment
1605         if (!getAlign(*CallI, i + 1, ParamAlign))
1606           ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value();
1607         O << ".param .align " << ParamAlign << " .b8 ";
1608         O << "_";
1609         O << "[" << DL.getTypeAllocSize(Ty) << "]";
1610         // update the index for Outs
1611         SmallVector<EVT, 16> vtparts;
1612         ComputeValueVTs(*this, DL, Ty, vtparts);
1613         if (unsigned len = vtparts.size())
1614           OIdx += len - 1;
1615         continue;
1616       }
1617       // i8 types in IR will be i16 types in SDAG
1618       assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1619               (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1620              "type mismatch between callee prototype and arguments");
1621       // scalar type
1622       unsigned sz = 0;
1623       if (isa<IntegerType>(Ty)) {
1624         sz = cast<IntegerType>(Ty)->getBitWidth();
1625         sz = promoteScalarArgumentSize(sz);
1626       } else if (isa<PointerType>(Ty)) {
1627         sz = PtrVT.getSizeInBits();
1628       } else {
1629         sz = Ty->getPrimitiveSizeInBits();
1630       }
1631       O << ".param .b" << sz << " ";
1632       O << "_";
1633       continue;
1634     }
1635 
1636     Type *ETy = Args[i].IndirectType;
1637     Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1638     Align ParamByValAlign =
1639         getFunctionByValParamAlign(F, ETy, InitialAlign, DL);
1640 
1641     O << ".param .align " << ParamByValAlign.value() << " .b8 ";
1642     O << "_";
1643     O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
1644   }
1645 
1646   if (VAInfo)
1647     O << (first ? "" : ",") << " .param .align " << VAInfo->second
1648       << " .b8 _[]\n";
1649   O << ")";
1650   if (shouldEmitPTXNoReturn(&CB, *nvTM))
1651     O << " .noreturn";
1652   O << ";";
1653 
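       // The resulting prototype string looks roughly like (illustrative only):
       //   prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b64 _);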
1654   return Prototype;
1655 }
1656 
1657 Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
1658                                                 const CallBase *CB, Type *Ty,
1659                                                 unsigned Idx,
1660                                                 const DataLayout &DL) const {
1661   if (!CB) {
1662     // CallSite is zero, fall back to the ABI type alignment.
1663     return DL.getABITypeAlign(Ty);
1664   }
1665 
1666   unsigned Alignment = 0;
1667   const Function *DirectCallee = CB->getCalledFunction();
1668 
1669   if (!DirectCallee) {
1670     // We don't have a direct function symbol, but that may be because of
1671     // constant cast instructions in the call.
1672 
1673     // With bitcast'd call targets, the instruction will be the call
1674     if (const auto *CI = dyn_cast<CallInst>(CB)) {
1675       // Check if we have call alignment metadata
1676       if (getAlign(*CI, Idx, Alignment))
1677         return Align(Alignment);
1678     }
1679     DirectCallee = getMaybeBitcastedCallee(CB);
1680   }
1681 
1682   // Check for function alignment information if we found that the
1683   // ultimate target is a Function
1684   if (DirectCallee) {
1685     if (getAlign(*DirectCallee, Idx, Alignment))
1686       return Align(Alignment);
1687     // If alignment information is not available, fall back to the
1688     // default function param optimized type alignment
1689     return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL);
1690   }
1691 
1692   // Call is indirect, fall back to the ABI type alignment
1693   return DL.getABITypeAlign(Ty);
1694 }
1695 
1696 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1697                                        SmallVectorImpl<SDValue> &InVals) const {
1698 
1699   if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1700     report_fatal_error(
1701         "Support for variadic functions (unsized array parameters) was "
1702         "introduced in PTX ISA version 6.0 and requires target sm_30.");
1703 
1704   SelectionDAG &DAG = CLI.DAG;
1705   SDLoc dl = CLI.DL;
1706   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1707   SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1708   SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1709   SDValue Chain = CLI.Chain;
1710   SDValue Callee = CLI.Callee;
1711   bool &isTailCall = CLI.IsTailCall;
1712   ArgListTy &Args = CLI.getArgs();
1713   Type *RetTy = CLI.RetTy;
1714   const CallBase *CB = CLI.CB;
1715   const DataLayout &DL = DAG.getDataLayout();
1716 
1717   bool isABI = (STI.getSmVersion() >= 20);
1718   assert(isABI && "Non-ABI compilation is not supported");
1719   if (!isABI)
1720     return Chain;
1721 
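       // The chain of glued nodes built below ultimately prints a PTX call
       // sequence that, for a simple case, looks roughly like (illustrative):
       //   .param .b32 param0;
       //   st.param.b32 [param0+0], %r1;
       //   .param .b32 retval0;
       //   call.uni (retval0), callee, (param0);
       //   ld.param.b32 %r2, [retval0+0];
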
1722   // Variadic arguments.
1723   //
1724   // Normally, for each argument, we declare a param scalar or a param
1725   // byte array in the .param space, and store the argument value to that
1726   // param scalar or array starting at offset 0.
1727   //
1728   // In the case of the first variadic argument, we declare a vararg byte array
1729   // with size 0. The exact size of this array isn't known at this point, so
1730   // it'll be patched later. All the variadic arguments will be stored to this
1731   // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1732   // initially set to 0, so it can be used for non-variadic arguments (which use
1733   // 0 offset) to simplify the code.
1734   //
1735   // After all variadic arguments are processed, 'VAOffset' holds the size
1736   // of the vararg byte array.
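       // For illustration (not literal output), a call like printf(fmt, i, j)
       // with two trailing i32 arguments would roughly declare and fill:
       //   .param .b64 param0;            // the fixed 'fmt' pointer argument
       //   .param .align 8 .b8 param1[8]; // vararg byte array, size patched in
       //   st.param.b32 [param1+0], %r1;  // i stored at VAOffset 0
       //   st.param.b32 [param1+4], %r2;  // j stored at VAOffset 4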
1737 
1738   SDValue VADeclareParam;                 // vararg byte array
1739   unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic
1740   unsigned VAOffset = 0;                  // current offset in the param array
1741 
1742   unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
1743   SDValue TempChain = Chain;
1744   Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
1745   SDValue InGlue = Chain.getValue(1);
1746 
1747   unsigned ParamCount = 0;
1748   // Args.size() and Outs.size() need not match.
1749   // Outs.size() will be larger
1750   //   * if there is an aggregate argument with multiple fields (each field
1751   //     showing up separately in Outs)
1752   //   * if there is a vector argument with more than typical vector-length
1753   //     elements (generally if more than 4) where each vector element is
1754   //     individually present in Outs.
1755   // So a different index should be used for indexing into Outs/OutVals.
1756   // See similar issue in LowerFormalArguments.
1757   unsigned OIdx = 0;
1758   // Declare the .params or .regs needed to pass values
1759   // to the function.
1760   for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1761     EVT VT = Outs[OIdx].VT;
1762     Type *Ty = Args[i].Ty;
1763     bool IsVAArg = (i >= CLI.NumFixedArgs);
1764     bool IsByVal = Outs[OIdx].Flags.isByVal();
1765 
1766     SmallVector<EVT, 16> VTs;
1767     SmallVector<uint64_t, 16> Offsets;
1768 
1769     assert((!IsByVal || Args[i].IndirectType) &&
1770            "byval arg must have indirect type");
1771     Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
1772     ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset);
1773 
1774     Align ArgAlign;
1775     if (IsByVal) {
1776       // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1777       // so we don't need to worry whether it's naturally aligned or not.
1778       // See TargetLowering::LowerCallTo().
1779       Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1780       ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
1781                                             InitialAlign, DL);
1782       if (IsVAArg)
1783         VAOffset = alignTo(VAOffset, ArgAlign);
1784     } else {
1785       ArgAlign = getArgumentAlignment(Callee, CB, Ty, ParamCount + 1, DL);
1786     }
1787 
1788     unsigned TypeSize =
1789         (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty));
1790     SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1791 
1792     bool NeedAlign; // Does argument declaration specify alignment?
1793     bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty);
1794     if (IsVAArg) {
1795       if (ParamCount == FirstVAArg) {
1796         SDValue DeclareParamOps[] = {
1797             Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32),
1798             DAG.getConstant(ParamCount, dl, MVT::i32),
1799             DAG.getConstant(1, dl, MVT::i32), InGlue};
1800         VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl,
1801                                              DeclareParamVTs, DeclareParamOps);
1802       }
1803       NeedAlign = PassAsArray;
1804     } else if (PassAsArray) {
1805       // declare .param .align <align> .b8 .param<n>[<size>];
1806       SDValue DeclareParamOps[] = {
1807           Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
1808           DAG.getConstant(ParamCount, dl, MVT::i32),
1809           DAG.getConstant(TypeSize, dl, MVT::i32), InGlue};
1810       Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1811                           DeclareParamOps);
1812       NeedAlign = true;
1813     } else {
1814       // declare .param .b<size> .param<n>;
1815       if (VT.isInteger() || VT.isFloatingPoint()) {
1816         // PTX ABI requires integral types to be at least 32 bits in
1817         // size. FP16 is loaded/stored using i16, so it's handled
1818         // here as well.
1819         TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8;
1820       }
1821       SDValue DeclareScalarParamOps[] = {
1822           Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
1823           DAG.getConstant(TypeSize * 8, dl, MVT::i32),
1824           DAG.getConstant(0, dl, MVT::i32), InGlue};
1825       Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1826                           DeclareScalarParamOps);
1827       NeedAlign = false;
1828     }
1829     InGlue = Chain.getValue(1);
1830 
1831     // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1832     // than 32-bits are sign extended or zero extended, depending on
1833     // whether they are signed or unsigned types. This case applies
1834     // only to scalar parameters and not to aggregate values.
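         // For example, a scalar i8 argument is declared as .param .b32 and its
         // value is sign- or zero-extended to 32 bits before the store below.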
1835     bool ExtendIntegerParam =
1836         Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1837 
1838     auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1839     SmallVector<SDValue, 6> StoreOperands;
1840     for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1841       EVT EltVT = VTs[j];
1842       int CurOffset = Offsets[j];
1843       MaybeAlign PartAlign;
1844       if (NeedAlign)
1845         PartAlign = commonAlignment(ArgAlign, CurOffset);
1846 
1847       // New store.
1848       if (VectorInfo[j] & PVF_FIRST) {
1849         assert(StoreOperands.empty() && "Unfinished preceding store.");
1850         StoreOperands.push_back(Chain);
1851         StoreOperands.push_back(
1852             DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
1853         StoreOperands.push_back(DAG.getConstant(
1854             IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
1855             dl, MVT::i32));
1856       }
1857 
1858       SDValue StVal = OutVals[OIdx];
1859 
1860       MVT PromotedVT;
1861       if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
1862         EltVT = EVT(PromotedVT);
1863       }
1864       if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {
1865         llvm::ISD::NodeType Ext =
1866             Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1867         StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
1868       }
1869 
1870       if (IsByVal) {
1871         auto PtrVT = getPointerTy(DL);
1872         SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
1873                                       DAG.getConstant(CurOffset, dl, PtrVT));
1874         StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(),
1875                             PartAlign);
1876       } else if (ExtendIntegerParam) {
1877         assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1878         // zext/sext to i32
1879         StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1880                                                       : ISD::ZERO_EXTEND,
1881                             dl, MVT::i32, StVal);
1882       }
1883 
1884       if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {
1885         // Use 16-bit registers for small stores as it's the
1886         // smallest general purpose register size supported by NVPTX.
1887         StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1888       }
1889 
1890       // Record the value to store.
1891       StoreOperands.push_back(StVal);
1892 
1893       if (VectorInfo[j] & PVF_LAST) {
1894         unsigned NumElts = StoreOperands.size() - 3;
1895         NVPTXISD::NodeType Op;
1896         switch (NumElts) {
1897         case 1:
1898           Op = NVPTXISD::StoreParam;
1899           break;
1900         case 2:
1901           Op = NVPTXISD::StoreParamV2;
1902           break;
1903         case 4:
1904           Op = NVPTXISD::StoreParamV4;
1905           break;
1906         default:
1907           llvm_unreachable("Invalid vector info.");
1908         }
1909 
1910         StoreOperands.push_back(InGlue);
1911 
1912         // Adjust type of the store op if we've extended the scalar
1913         // argument value.
1914         EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1915 
1916         Chain = DAG.getMemIntrinsicNode(
1917             Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1918             TheStoreType, MachinePointerInfo(), PartAlign,
1919             MachineMemOperand::MOStore);
1920         InGlue = Chain.getValue(1);
1921 
1922         // Cleanup.
1923         StoreOperands.clear();
1924 
1925         // TODO: We may need to support vector types that can be passed
1926         // as scalars in variadic arguments.
1927         if (!IsByVal && IsVAArg) {
1928           assert(NumElts == 1 &&
1929                  "Vectorization is expected to be disabled for variadics.");
1930           VAOffset += DL.getTypeAllocSize(
1931               TheStoreType.getTypeForEVT(*DAG.getContext()));
1932         }
1933       }
1934       if (!IsByVal)
1935         ++OIdx;
1936     }
1937     assert(StoreOperands.empty() && "Unfinished parameter store.");
1938     if (!IsByVal && VTs.size() > 0)
1939       --OIdx;
1940     ++ParamCount;
1941     if (IsByVal && IsVAArg)
1942       VAOffset += TypeSize;
1943   }
1944 
1945   GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1946   MaybeAlign retAlignment = std::nullopt;
1947 
1948   // Handle Result
1949   if (Ins.size() > 0) {
1950     SmallVector<EVT, 16> resvtparts;
1951     ComputeValueVTs(*this, DL, RetTy, resvtparts);
1952 
1953     // Declare
1954     //  .param .align N .b8 retval0[<size-in-bytes>], or
1955     //  .param .b<size-in-bits> retval0
1956     unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1957     if (!IsTypePassedAsArray(RetTy)) {
1958       resultsz = promoteScalarArgumentSize(resultsz);
1959       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1960       SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1961                                   DAG.getConstant(resultsz, dl, MVT::i32),
1962                                   DAG.getConstant(0, dl, MVT::i32), InGlue };
1963       Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1964                           DeclareRetOps);
1965       InGlue = Chain.getValue(1);
1966     } else {
1967       retAlignment = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
1968       assert(retAlignment && "retAlignment is guaranteed to be set");
1969       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1970       SDValue DeclareRetOps[] = {
1971           Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
1972           DAG.getConstant(resultsz / 8, dl, MVT::i32),
1973           DAG.getConstant(0, dl, MVT::i32), InGlue};
1974       Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1975                           DeclareRetOps);
1976       InGlue = Chain.getValue(1);
1977     }
1978   }
1979 
1980   bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1981   // Set the size of the vararg param byte array if the callee is a variadic
1982   // function and the variadic part is not empty.
1983   if (HasVAArgs) {
1984     SDValue DeclareParamOps[] = {
1985         VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),
1986         VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),
1987         VADeclareParam.getOperand(4)};
1988     DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1989                     VADeclareParam->getVTList(), DeclareParamOps);
1990   }
1991 
1992   // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1993   // between them we must rely on the call site value which is valid for
1994   // indirect calls but is always null for libcalls.
1995   bool isIndirectCall = !Func && CB;
1996 
1997   if (isa<ExternalSymbolSDNode>(Callee)) {
1998     Function* CalleeFunc = nullptr;
1999 
2000     // Try to find the callee in the current module.
2001     Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
2002     assert(CalleeFunc != nullptr && "Libcall callee must be set.");
2003 
2004     // Set the "libcall callee" attribute to indicate that the function
2005     // must always have a declaration.
2006     CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
2007   }
2008 
2009   if (isIndirectCall) {
2010     // This is the indirect function call case: PTX requires a prototype of
2011     // the form
2012     // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
2013     // to be emitted, and the label has to be used as the last arg of the
2014     // call instruction.
2015     // The prototype is embedded in a string and used as the operand of a
2016     // CallPrototype SDNode, which prints out as the value of the string.
2017     SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2018     std::string Proto = getPrototype(
2019         DL, RetTy, Args, Outs, retAlignment,
2020         HasVAArgs
2021             ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
2022                   CLI.NumFixedArgs, VADeclareParam->getConstantOperandAPInt(1)))
2023             : std::nullopt,
2024         *CB, UniqueCallSite);
2025     const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
2026     SDValue ProtoOps[] = {
2027         Chain,
2028         DAG.getTargetExternalSymbol(ProtoStr, MVT::i32),
2029         InGlue,
2030     };
2031     Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
2032     InGlue = Chain.getValue(1);
2033   }
2034   // Op to just print "call"
2035   SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2036   SDValue PrintCallOps[] = {
2037     Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
2038   };
2039   // We model convergent calls as separate opcodes.
2040   unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
2041   if (CLI.IsConvergent)
2042     Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
2043                                               : NVPTXISD::PrintConvergentCall;
2044   Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
2045   InGlue = Chain.getValue(1);
2046 
2047   // Ops to print out the function name
2048   SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2049   SDValue CallVoidOps[] = { Chain, Callee, InGlue };
2050   Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
2051   InGlue = Chain.getValue(1);
2052 
2053   // Ops to print out the param list
2054   SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2055   SDValue CallArgBeginOps[] = { Chain, InGlue };
2056   Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
2057                       CallArgBeginOps);
2058   InGlue = Chain.getValue(1);
2059 
2060   for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e;
2061        ++i) {
2062     unsigned opcode;
2063     if (i == (e - 1))
2064       opcode = NVPTXISD::LastCallArg;
2065     else
2066       opcode = NVPTXISD::CallArg;
2067     SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2068     SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
2069                              DAG.getConstant(i, dl, MVT::i32), InGlue };
2070     Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
2071     InGlue = Chain.getValue(1);
2072   }
2073   SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2074   SDValue CallArgEndOps[] = { Chain,
2075                               DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
2076                               InGlue };
2077   Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
2078   InGlue = Chain.getValue(1);
2079 
2080   if (isIndirectCall) {
2081     SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2082     SDValue PrototypeOps[] = {
2083         Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
2084     Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
2085     InGlue = Chain.getValue(1);
2086   }
2087 
2088   SmallVector<SDValue, 16> ProxyRegOps;
2089   SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
2090 
2091   // Generate loads from param memory/moves from registers for result
2092   if (Ins.size() > 0) {
2093     SmallVector<EVT, 16> VTs;
2094     SmallVector<uint64_t, 16> Offsets;
2095     ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
2096     assert(VTs.size() == Ins.size() && "Bad value decomposition");
2097 
2098     Align RetAlign = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
2099     auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
2100 
2101     SmallVector<EVT, 6> LoadVTs;
2102     int VecIdx = -1; // Index of the first element of the vector.
2103 
2104     // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2105     // 32-bits are sign extended or zero extended, depending on whether
2106     // they are signed or unsigned types.
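         // For example, an i8 or i16 return value is read back as a 32-bit
         // LoadParam here and truncated to its original type via the ProxyReg
         // nodes appended after the call sequence ends.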
2107     bool ExtendIntegerRetVal =
2108         RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2109 
2110     for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2111       bool needTruncate = false;
2112       EVT TheLoadType = VTs[i];
2113       EVT EltType = Ins[i].VT;
2114       Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
2115       MVT PromotedVT;
2116 
2117       if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
2118         TheLoadType = EVT(PromotedVT);
2119         EltType = EVT(PromotedVT);
2120         needTruncate = true;
2121       }
2122 
2123       if (ExtendIntegerRetVal) {
2124         TheLoadType = MVT::i32;
2125         EltType = MVT::i32;
2126         needTruncate = true;
2127       } else if (TheLoadType.getSizeInBits() < 16) {
2128         if (VTs[i].isInteger())
2129           needTruncate = true;
2130         EltType = MVT::i16;
2131       }
2132 
2133       // Record index of the very first element of the vector.
2134       if (VectorInfo[i] & PVF_FIRST) {
2135         assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
2136         VecIdx = i;
2137       }
2138 
2139       LoadVTs.push_back(EltType);
2140 
2141       if (VectorInfo[i] & PVF_LAST) {
2142         unsigned NumElts = LoadVTs.size();
2143         LoadVTs.push_back(MVT::Other);
2144         LoadVTs.push_back(MVT::Glue);
2145         NVPTXISD::NodeType Op;
2146         switch (NumElts) {
2147         case 1:
2148           Op = NVPTXISD::LoadParam;
2149           break;
2150         case 2:
2151           Op = NVPTXISD::LoadParamV2;
2152           break;
2153         case 4:
2154           Op = NVPTXISD::LoadParamV4;
2155           break;
2156         default:
2157           llvm_unreachable("Invalid vector info.");
2158         }
2159 
2160         SDValue LoadOperands[] = {
2161             Chain, DAG.getConstant(1, dl, MVT::i32),
2162             DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue};
2163         SDValue RetVal = DAG.getMemIntrinsicNode(
2164             Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
2165             MachinePointerInfo(), EltAlign,
2166             MachineMemOperand::MOLoad);
2167 
2168         for (unsigned j = 0; j < NumElts; ++j) {
2169           ProxyRegOps.push_back(RetVal.getValue(j));
2170 
2171           if (needTruncate)
2172             ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT));
2173           else
2174             ProxyRegTruncates.push_back(std::optional<MVT>());
2175         }
2176 
2177         Chain = RetVal.getValue(NumElts);
2178         InGlue = RetVal.getValue(NumElts + 1);
2179 
2180         // Cleanup
2181         VecIdx = -1;
2182         LoadVTs.clear();
2183       }
2184     }
2185   }
2186 
2187   Chain =
2188       DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
2189   InGlue = Chain.getValue(1);
2190 
2191   // Append ProxyReg instructions to the chain to make sure that `callseq_end`
2192   // will not get lost. Otherwise, during libcalls expansion, the nodes can become
2193   // will not get lost. Otherwise, during libcall expansion, the nodes can
2194   // become dangling.
2195     SDValue Ret = DAG.getNode(
2196       NVPTXISD::ProxyReg, dl,
2197       DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
2198       { Chain, ProxyRegOps[i], InGlue }
2199     );
2200 
2201     Chain = Ret.getValue(1);
2202     InGlue = Ret.getValue(2);
2203 
2204     if (ProxyRegTruncates[i]) {
2205       Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret);
2206     }
2207 
2208     InVals.push_back(Ret);
2209   }
2210 
2211   // set isTailCall to false for now, until we figure out how to express
2212   // tail call optimization in PTX
2213   isTailCall = false;
2214   return Chain;
2215 }
2216 
2217 SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
2218                                                      SelectionDAG &DAG) const {
2219   const Function &Fn = DAG.getMachineFunction().getFunction();
2220 
2221   DiagnosticInfoUnsupported NoDynamicAlloca(
2222       Fn, "dynamic alloca unsupported by NVPTX backend",
2223       SDLoc(Op).getDebugLoc());
2224   DAG.getContext()->diagnose(NoDynamicAlloca);
2225   auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
2226   return DAG.getMergeValues(Ops, SDLoc());
2227 }
2228 
2229 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
2230 // (see LegalizeDAG.cpp). This is slow and uses local memory.
2231 // We use extract/insert/build vector just as LegalizeOp() does in llvm 2.5.
2232 SDValue
2233 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
2234   SDNode *Node = Op.getNode();
2235   SDLoc dl(Node);
2236   SmallVector<SDValue, 8> Ops;
2237   unsigned NumOperands = Node->getNumOperands();
2238   for (unsigned i = 0; i < NumOperands; ++i) {
2239     SDValue SubOp = Node->getOperand(i);
2240     EVT VVT = SubOp.getNode()->getValueType(0);
2241     EVT EltVT = VVT.getVectorElementType();
2242     unsigned NumSubElem = VVT.getVectorNumElements();
2243     for (unsigned j = 0; j < NumSubElem; ++j) {
2244       Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
2245                                 DAG.getIntPtrConstant(j, dl)));
2246     }
2247   }
2248   return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
2249 }
2250 
2251 // We can initialize a constant f16x2/v2i16/v4i8 with a single .b32 move.
2252 // Normally it would get lowered as two constant loads and a vector-packing
2253 // move. Instead we want just a constant move:
2254 //        mov.b32         %r2, 0x40003C00
2255 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2256                                                SelectionDAG &DAG) const {
2257   EVT VT = Op->getValueType(0);
2258   if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
2259     return Op;
2260 
2261   SDLoc DL(Op);
2262 
2263   if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2264         return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2265                isa<ConstantFPSDNode>(Operand);
2266       })) {
2267     // Lower a non-constant v4i8 vector as a byte-wise constructed i32, which
2268     // allows us to optimize calculation of the constant parts.
2269     if (VT == MVT::v4i8) {
2270       SDValue C8 = DAG.getConstant(8, DL, MVT::i32);
2271       SDValue E01 = DAG.getNode(
2272           NVPTXISD::BFI, DL, MVT::i32,
2273           DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
2274           DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
2275       SDValue E012 =
2276           DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2277                       DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
2278                       E01, DAG.getConstant(16, DL, MVT::i32), C8);
2279       SDValue E0123 =
2280           DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2281                       DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
2282                       E012, DAG.getConstant(24, DL, MVT::i32), C8);
2283       return DAG.getNode(ISD::BITCAST, DL, VT, E0123);
2284     }
2285     return Op;
2286   }
2287 
2288   // Get value or the Nth operand as an APInt(32). Undef values treated as 0.
2289   // Get the value of the Nth operand as an APInt(32); undef values are 0.
2290     const SDValue &Operand = Op->getOperand(N);
2291     EVT VT = Op->getValueType(0);
2292     if (Operand->isUndef())
2293       return APInt(32, 0);
2294     APInt Value;
2295     if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2296       Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2297     else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2298       Value = Operand->getAsAPIntVal();
2299     else
2300       llvm_unreachable("Unsupported type");
2301     // i8 values are carried around as i16, so we need to zero out the upper
2302     // bits so they do not get in the way of combining individual byte values.
2303     if (VT == MVT::v4i8)
2304       Value = Value.trunc(8);
2305     return Value.zext(32);
2306   };
2307   APInt Value;
2308   if (Isv2x16VT(VT)) {
2309     Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16);
2310   } else if (VT == MVT::v4i8) {
2311     Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) |
2312             GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24);
2313   } else {
2314     llvm_unreachable("Unsupported type");
2315   }
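       // E.g. <half 1.0, half 2.0> packs to 0x3C00 | (0x4000 << 16) = 0x40003C00
       // and <i8 1, i8 2, i8 3, i8 4> packs to 0x04030201; either way a single
       // 32-bit constant is materialized below and bitcast back to the vector.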
2316   SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32);
2317   return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const);
2318 }
2319 
2320 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2321                                                      SelectionDAG &DAG) const {
2322   SDValue Index = Op->getOperand(1);
2323   SDValue Vector = Op->getOperand(0);
2324   SDLoc DL(Op);
2325   EVT VectorVT = Vector.getValueType();
2326 
2327   if (VectorVT == MVT::v4i8) {
2328     SDValue BFE =
2329         DAG.getNode(NVPTXISD::BFE, DL, MVT::i32,
2330                     {Vector,
2331                      DAG.getNode(ISD::MUL, DL, MVT::i32,
2332                                  DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2333                                  DAG.getConstant(8, DL, MVT::i32)),
2334                      DAG.getConstant(8, DL, MVT::i32)});
2335     return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0));
2336   }
2337 
2338   // Constant index will be matched by tablegen.
2339   if (isa<ConstantSDNode>(Index.getNode()))
2340     return Op;
2341 
2342   // Extract individual elements and select one of them.
2343   assert(Isv2x16VT(VectorVT) && "Unexpected vector type.");
2344   EVT EltVT = VectorVT.getVectorElementType();
2345 
2346   SDLoc dl(Op.getNode());
2347   SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2348                            DAG.getIntPtrConstant(0, dl));
2349   SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2350                            DAG.getIntPtrConstant(1, dl));
2351   return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2352                          ISD::CondCode::SETEQ);
2353 }
2354 
2355 SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2356                                                     SelectionDAG &DAG) const {
2357   SDValue Vector = Op->getOperand(0);
2358   EVT VectorVT = Vector.getValueType();
2359 
2360   if (VectorVT != MVT::v4i8)
2361     return Op;
2362   SDLoc DL(Op);
2363   SDValue Value = Op->getOperand(1);
2364   if (Value->isUndef())
2365     return Vector;
2366 
2367   SDValue Index = Op->getOperand(2);
2368 
2369   SDValue BFI =
2370       DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2371                   {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2372                    DAG.getNode(ISD::MUL, DL, MVT::i32,
2373                                DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2374                                DAG.getConstant(8, DL, MVT::i32)),
2375                    DAG.getConstant(8, DL, MVT::i32)});
2376   return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2377 }
2378 
2379 SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2380                                                  SelectionDAG &DAG) const {
2381   SDValue V1 = Op.getOperand(0);
2382   EVT VectorVT = V1.getValueType();
2383   if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2384     return Op;
2385 
2386   // Lower shuffle to PRMT instruction.
2387   const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2388   SDValue V2 = Op.getOperand(1);
2389   uint32_t Selector = 0;
2390   for (auto I : llvm::enumerate(SVN->getMask()))
2391     Selector |= (I.value() << (I.index() * 4));
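       // E.g. a shuffle mask <1, 0, 7, 6> yields Selector
       // 1 | (0 << 4) | (7 << 8) | (6 << 12) = 0x6701, i.e. a prmt.b32 that
       // takes bytes 1 and 0 from V1 followed by bytes 3 and 2 from V2.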
2392 
2393   SDLoc DL(Op);
2394   return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2,
2395                      DAG.getConstant(Selector, DL, MVT::i32),
2396                      DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32));
2397 }
2398 /// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which
2399 /// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
2400 ///    amount, or
2401 /// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
2402 ///    amount.
2403 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2404                                                   SelectionDAG &DAG) const {
2405   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2406   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2407 
2408   EVT VT = Op.getValueType();
2409   unsigned VTBits = VT.getSizeInBits();
2410   SDLoc dl(Op);
2411   SDValue ShOpLo = Op.getOperand(0);
2412   SDValue ShOpHi = Op.getOperand(1);
2413   SDValue ShAmt  = Op.getOperand(2);
2414   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2415 
2416   if (VTBits == 32 && STI.getSmVersion() >= 35) {
2417     // On sm_35+, 32-bit values can use the funnel shift 'shf' instruction.
2418     // {dHi, dLo} = {aHi, aLo} >> Amt
2419     //   dHi = aHi >> Amt
2420     //   dLo = shf.r.clamp aLo, aHi, Amt
2421 
2422     SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2423     SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
2424                              ShAmt);
2425 
2426     SDValue Ops[2] = { Lo, Hi };
2427     return DAG.getMergeValues(Ops, dl);
2428   } else {
2430     // {dHi, dLo} = {aHi, aLo} >> Amt
2431     // - if (Amt>=size) then
2432     //      dLo = aHi >> (Amt-size)
2433     //      dHi = aHi >> Amt (this is either all 0 or all 1)
2434     //   else
2435     //      dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2436     //      dHi = aHi >> Amt
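         //   e.g. with size = 32 and Amt = 8:
         //      dLo = (aLo >> 8) | (aHi << 24),  dHi = aHi >> 8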
2437 
2438     SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2439                                    DAG.getConstant(VTBits, dl, MVT::i32),
2440                                    ShAmt);
2441     SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2442     SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2443                                      DAG.getConstant(VTBits, dl, MVT::i32));
2444     SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2445     SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2446     SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2447 
2448     SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2449                                DAG.getConstant(VTBits, dl, MVT::i32),
2450                                ISD::SETGE);
2451     SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2452     SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2453 
2454     SDValue Ops[2] = { Lo, Hi };
2455     return DAG.getMergeValues(Ops, dl);
2456   }
2457 }
2458 
2459 /// LowerShiftLeftParts - Lower SHL_PARTS, which
2460 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2461 ///    amount, or
2462 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2463 ///    amount.
2464 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2465                                                  SelectionDAG &DAG) const {
2466   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2467   assert(Op.getOpcode() == ISD::SHL_PARTS);
2468 
2469   EVT VT = Op.getValueType();
2470   unsigned VTBits = VT.getSizeInBits();
2471   SDLoc dl(Op);
2472   SDValue ShOpLo = Op.getOperand(0);
2473   SDValue ShOpHi = Op.getOperand(1);
2474   SDValue ShAmt  = Op.getOperand(2);
2475 
2476   if (VTBits == 32 && STI.getSmVersion() >= 35) {
2477     // For 32-bit values on sm_35+, we can use the funnel shift 'shf' instruction.
2478     // {dHi, dLo} = {aHi, aLo} << Amt
2479     //   dHi = shf.l.clamp aLo, aHi, Amt
2480     //   dLo = aLo << Amt
2481 
2482     SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2483                              ShAmt);
2484     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2485 
2486     SDValue Ops[2] = { Lo, Hi };
2487     return DAG.getMergeValues(Ops, dl);
2488   }
2489   else {
2490     // {dHi, dLo} = {aHi, aLo} << Amt
2491     // - if (Amt>=size) then
2492     //      dLo = aLo << Amt (all 0)
2493     //      dHi = aLo << (Amt-size)
2494     //   else
2495     //      dLo = aLo << Amt
2496     //      dHi = (aHi << Amt) | (aLo >> (size-Amt))
2497 
2498     SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2499                                    DAG.getConstant(VTBits, dl, MVT::i32),
2500                                    ShAmt);
2501     SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2502     SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2503                                      DAG.getConstant(VTBits, dl, MVT::i32));
2504     SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2505     SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2506     SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2507 
2508     SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2509                                DAG.getConstant(VTBits, dl, MVT::i32),
2510                                ISD::SETGE);
2511     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2512     SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2513 
2514     SDValue Ops[2] = { Lo, Hi };
2515     return DAG.getMergeValues(Ops, dl);
2516   }
2517 }
2518 
2519 SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2520   EVT VT = Op.getValueType();
2521 
2522   if (VT == MVT::f32)
2523     return LowerFROUND32(Op, DAG);
2524 
2525   if (VT == MVT::f64)
2526     return LowerFROUND64(Op, DAG);
2527 
2528   llvm_unreachable("unhandled type");
2529 }
2530 
2531 // This is the rounding method used in CUDA libdevice, expressed as C-like code:
2532 // float roundf(float A)
2533 // {
2534 //   float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2535 //   RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2536 //   return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2537 // }
2538 SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2539                                            SelectionDAG &DAG) const {
2540   SDLoc SL(Op);
2541   SDValue A = Op.getOperand(0);
2542   EVT VT = Op.getValueType();
2543 
2544   SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2545 
2546   // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2547   SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2548   const int SignBitMask = 0x80000000;
2549   SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2550                              DAG.getConstant(SignBitMask, SL, MVT::i32));
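       // 0x3F000000 is the IEEE-754 bit pattern of 0.5f; OR-ing it with the
       // sign bit of A yields copysign(0.5f, A) without a compare.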
2551   const int PointFiveInBits = 0x3F000000;
2552   SDValue PointFiveWithSignRaw =
2553       DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2554                   DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2555   SDValue PointFiveWithSign =
2556       DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2557   SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2558   SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2559 
2560   // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2561   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2562   SDValue IsLarge =
2563       DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2564                    ISD::SETOGT);
2565   RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2566 
2567   // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2568   SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2569                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2570   SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2571   return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2572 }
2573 
2574 // The implementation of round(double) is similar to that of round(float) in
2575 // that they both separate the value range into three regions and use a method
2576 // specific to the region to round the values. However, round(double) first
2577 // calculates the round of the absolute value and then adds the sign back while
2578 // round(float) directly rounds the value with sign.
2579 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2580                                            SelectionDAG &DAG) const {
2581   SDLoc SL(Op);
2582   SDValue A = Op.getOperand(0);
2583   EVT VT = Op.getValueType();
2584 
2585   SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2586 
2587   // double RoundedA = (double) (int) (abs(A) + 0.5f);
2588   SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2589                                   DAG.getConstantFP(0.5, SL, VT));
2590   SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2591 
2592   // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2593   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2594   SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2595                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2596   RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2597                          DAG.getConstantFP(0, SL, VT),
2598                          RoundedA);
2599 
2600   // Add the sign back to RoundedA.
2601   RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2603 
2604   // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
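       // At or above 2^52 every representable f64 is already an integer, so A
       // passes through unchanged in that range.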
2605   SDValue IsLarge =
2606       DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2607                    ISD::SETOGT);
2608   return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2609 }
2610 
2611 SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2612                                             SelectionDAG &DAG) const {
2613   assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2614 
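       // bf16 results are produced by doing the conversion in f32 and then
       // rounding the result down to bf16 with FP_ROUND.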
2615   if (Op.getValueType() == MVT::bf16) {
2616     SDLoc Loc(Op);
2617     return DAG.getNode(
2618         ISD::FP_ROUND, Loc, MVT::bf16,
2619         DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2620         DAG.getIntPtrConstant(0, Loc));
2621   }
2622 
2623   // Everything else is considered legal.
2624   return Op;
2625 }
2626 
2627 SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2628                                             SelectionDAG &DAG) const {
2629   assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2630 
2631   if (Op.getOperand(0).getValueType() == MVT::bf16) {
2632     SDLoc Loc(Op);
2633     return DAG.getNode(
2634         Op.getOpcode(), Loc, Op.getValueType(),
2635         DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2636   }
2637 
2638   // Everything else is considered legal.
2639   return Op;
2640 }
2641 
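     // Lower a v2i16 arithmetic op by extracting each lane from every operand,
     // performing the operation on the scalars, and rebuilding the result with
     // BUILD_VECTOR. Ops of any other type are returned unchanged.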
2642 static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
2643   SDLoc DL(Op);
2644   if (Op.getValueType() != MVT::v2i16)
2645     return Op;
2646   EVT EltVT = Op.getValueType().getVectorElementType();
2647   SmallVector<SDValue> VecElements;
2648   for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2649     SmallVector<SDValue> ScalarArgs;
2650     llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2651                     [&](const SDUse &O) {
2652                       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2653                                          O.get(), DAG.getIntPtrConstant(I, DL));
2654                     });
2655     VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2656   }
2657   SDValue V =
2658       DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2659   return V;
2660 }
2661 
2662 SDValue
2663 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2664   switch (Op.getOpcode()) {
2665   case ISD::RETURNADDR:
2666     return SDValue();
2667   case ISD::FRAMEADDR:
2668     return SDValue();
2669   case ISD::GlobalAddress:
2670     return LowerGlobalAddress(Op, DAG);
2671   case ISD::INTRINSIC_W_CHAIN:
2672     return Op;
2673   case ISD::BUILD_VECTOR:
2674     return LowerBUILD_VECTOR(Op, DAG);
2675   case ISD::EXTRACT_SUBVECTOR:
2676     return Op;
2677   case ISD::EXTRACT_VECTOR_ELT:
2678     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2679   case ISD::INSERT_VECTOR_ELT:
2680     return LowerINSERT_VECTOR_ELT(Op, DAG);
2681   case ISD::VECTOR_SHUFFLE:
2682     return LowerVECTOR_SHUFFLE(Op, DAG);
2683   case ISD::CONCAT_VECTORS:
2684     return LowerCONCAT_VECTORS(Op, DAG);
2685   case ISD::STORE:
2686     return LowerSTORE(Op, DAG);
2687   case ISD::LOAD:
2688     return LowerLOAD(Op, DAG);
2689   case ISD::SHL_PARTS:
2690     return LowerShiftLeftParts(Op, DAG);
2691   case ISD::SRA_PARTS:
2692   case ISD::SRL_PARTS:
2693     return LowerShiftRightParts(Op, DAG);
2694   case ISD::SELECT:
2695     return LowerSelect(Op, DAG);
2696   case ISD::FROUND:
2697     return LowerFROUND(Op, DAG);
2698   case ISD::SINT_TO_FP:
2699   case ISD::UINT_TO_FP:
2700     return LowerINT_TO_FP(Op, DAG);
2701   case ISD::FP_TO_SINT:
2702   case ISD::FP_TO_UINT:
2703     return LowerFP_TO_INT(Op, DAG);
2704   case ISD::VAARG:
2705     return LowerVAARG(Op, DAG);
2706   case ISD::VASTART:
2707     return LowerVASTART(Op, DAG);
2708   case ISD::ABS:
2709   case ISD::SMIN:
2710   case ISD::SMAX:
2711   case ISD::UMIN:
2712   case ISD::UMAX:
2713   case ISD::ADD:
2714   case ISD::SUB:
2715   case ISD::MUL:
2716   case ISD::SHL:
2717   case ISD::SREM:
2718   case ISD::UREM:
2719     return LowerVectorArith(Op, DAG);
2720   case ISD::DYNAMIC_STACKALLOC:
2721     return LowerDYNAMIC_STACKALLOC(Op, DAG);
2722   default:
2723     llvm_unreachable("Custom lowering not defined for operation");
2724   }
2725 }
2726 
2727 // This function is almost a copy of SelectionDAG::expandVAArg().
2728 // The only difference is that this one produces loads from the local address space.
2729 SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
2730   const TargetLowering *TLI = STI.getTargetLowering();
2731   SDLoc DL(Op);
2732 
2733   SDNode *Node = Op.getNode();
2734   const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
2735   EVT VT = Node->getValueType(0);
2736   auto *Ty = VT.getTypeForEVT(*DAG.getContext());
2737   SDValue Tmp1 = Node->getOperand(0);
2738   SDValue Tmp2 = Node->getOperand(1);
2739   const MaybeAlign MA(Node->getConstantOperandVal(3));
2740 
2741   SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
2742                                    Tmp1, Tmp2, MachinePointerInfo(V));
2743   SDValue VAList = VAListLoad;
2744 
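       // If the requested alignment exceeds the minimum stack argument
       // alignment, round the va_list pointer up to that alignment:
       // VAList = (VAList + MA - 1) & -MA.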
2745   if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
2746     VAList = DAG.getNode(
2747         ISD::ADD, DL, VAList.getValueType(), VAList,
2748         DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
2749 
2750     VAList = DAG.getNode(
2751         ISD::AND, DL, VAList.getValueType(), VAList,
2752         DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType()));
2753   }
2754 
2755   // Increment the pointer, VAList, to the next vaarg
2756   Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
2757                      DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty),
2758                                      DL, VAList.getValueType()));
2759 
2760   // Store the incremented VAList to the legalized pointer
2761   Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
2762                       MachinePointerInfo(V));
2763 
2764   const Value *SrcV =
2765       Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL));
2766 
2767   // Load the actual argument out of the pointer VAList
2768   return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
2769 }
2770 
2771 SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
2772   const TargetLowering *TLI = STI.getTargetLowering();
2773   SDLoc DL(Op);
2774   EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
2775 
2776   // Store the address of unsized array <function>_vararg[] in the ap object.
2777   SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
2778   SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg);
2779 
2780   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2781   return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
2782                       MachinePointerInfo(SV));
2783 }
2784 
2785 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2786   SDValue Op0 = Op->getOperand(0);
2787   SDValue Op1 = Op->getOperand(1);
2788   SDValue Op2 = Op->getOperand(2);
2789   SDLoc DL(Op.getNode());
2790 
2791   assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2792 
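       // i1 select is lowered by any-extending the operands to i32, selecting
       // on i32, and truncating the result back to i1.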
2793   Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2794   Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2795   SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2796   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2797 
2798   return Trunc;
2799 }
2800 
2801 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2802   if (Op.getValueType() == MVT::i1)
2803     return LowerLOADi1(Op, DAG);
2804 
2805   // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle
2806   // unaligned loads and have to handle it here.
2807   EVT VT = Op.getValueType();
2808   if (Isv2x16VT(VT) || VT == MVT::v4i8) {
2809     LoadSDNode *Load = cast<LoadSDNode>(Op);
2810     EVT MemVT = Load->getMemoryVT();
2811     if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2812                                         MemVT, *Load->getMemOperand())) {
2813       SDValue Ops[2];
2814       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2815       return DAG.getMergeValues(Ops, SDLoc(Op));
2816     }
2817   }
2818 
2819   return SDValue();
2820 }
2821 
2822 // v = ld i1* addr
2823 //   =>
2824 // v1 = ld i8* addr (-> i16)
2825 // v = trunc i16 to i1
2826 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2827   SDNode *Node = Op.getNode();
2828   LoadSDNode *LD = cast<LoadSDNode>(Node);
2829   SDLoc dl(Node);
2830   assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2831   assert(Node->getValueType(0) == MVT::i1 &&
2832          "Custom lowering for i1 load only");
2833   SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2834                               LD->getPointerInfo(), LD->getAlign(),
2835                               LD->getMemOperand()->getFlags());
2836   SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2837   // The legalizer (the caller) is expecting two values from the legalized
2838   // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2839   // in LegalizeDAG.cpp which also uses MergeValues.
2840   SDValue Ops[] = { result, LD->getChain() };
2841   return DAG.getMergeValues(Ops, dl);
2842 }
2843 
2844 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2845   StoreSDNode *Store = cast<StoreSDNode>(Op);
2846   EVT VT = Store->getMemoryVT();
2847 
2848   if (VT == MVT::i1)
2849     return LowerSTOREi1(Op, DAG);
2850 
2851   // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on the legalizer to
2852   // handle unaligned stores and have to handle them here.
2853   if ((Isv2x16VT(VT) || VT == MVT::v4i8) &&
2854       !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2855                                       VT, *Store->getMemOperand()))
2856     return expandUnalignedStore(Store, DAG);
2857 
2858   // v2f16, v2bf16, v2i16 and v4i8 don't need any further special handling.
2859   if (Isv2x16VT(VT) || VT == MVT::v4i8)
2860     return SDValue();
2861 
2862   if (VT.isVector())
2863     return LowerSTOREVector(Op, DAG);
2864 
2865   return SDValue();
2866 }
2867 
2868 SDValue
2869 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2870   SDNode *N = Op.getNode();
2871   SDValue Val = N->getOperand(1);
2872   SDLoc DL(N);
2873   EVT ValVT = Val.getValueType();
2874 
2875   if (ValVT.isVector()) {
2876     // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2877     // legal.  We can (and should) split that into 2 stores of <2 x double> here
2878     // but I'm leaving that as a TODO for now.
2879     if (!ValVT.isSimple())
2880       return SDValue();
2881     switch (ValVT.getSimpleVT().SimpleTy) {
2882     default:
2883       return SDValue();
2884     case MVT::v2i8:
2885     case MVT::v2i16:
2886     case MVT::v2i32:
2887     case MVT::v2i64:
2888     case MVT::v2f16:
2889     case MVT::v2bf16:
2890     case MVT::v2f32:
2891     case MVT::v2f64:
2892     case MVT::v4i8:
2893     case MVT::v4i16:
2894     case MVT::v4i32:
2895     case MVT::v4f16:
2896     case MVT::v4bf16:
2897     case MVT::v4f32:
2898     case MVT::v8f16: // <4 x f16x2>
2899     case MVT::v8bf16: // <4 x bf16x2>
2900     case MVT::v8i16:  // <4 x i16x2>
2901       // This is a "native" vector type
2902       break;
2903     }
2904 
2905     MemSDNode *MemSD = cast<MemSDNode>(N);
2906     const DataLayout &TD = DAG.getDataLayout();
2907 
2908     Align Alignment = MemSD->getAlign();
2909     Align PrefAlign =
2910         TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
2911     if (Alignment < PrefAlign) {
2912       // This store is not sufficiently aligned, so bail out and let this vector
2913       // store be scalarized.  Note that we may still be able to emit smaller
2914       // vector stores.  For example, if we are storing a <4 x float> with an
2915       // alignment of 8, this check will fail but the legalizer will try again
2916       // with 2 x <2 x float>, which will succeed with an alignment of 8.
2917       return SDValue();
2918     }
2919 
2920     unsigned Opcode = 0;
2921     EVT EltVT = ValVT.getVectorElementType();
2922     unsigned NumElts = ValVT.getVectorNumElements();
2923 
2924     // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
2925     // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
2926     // stored type to i16 and propagate the "real" type as the memory type.
2927     bool NeedExt = false;
2928     if (EltVT.getSizeInBits() < 16)
2929       NeedExt = true;
2930 
2931     bool StoreF16x2 = false;
2932     switch (NumElts) {
2933     default:
2934       return SDValue();
2935     case 2:
2936       Opcode = NVPTXISD::StoreV2;
2937       break;
2938     case 4:
2939       Opcode = NVPTXISD::StoreV4;
2940       break;
2941     case 8:
2942       // v8f16, v8bf16 and v8i16 are special cases. PTX doesn't have 8-element
2943       // vector stores for 16-bit types. Instead, we split the vector into
2944       // 2-element chunks and store them with st.v4.b32.
2945       assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector.");
2946       Opcode = NVPTXISD::StoreV4;
2947       StoreF16x2 = true;
2948       break;
2949     }
2950 
2951     SmallVector<SDValue, 8> Ops;
2952 
2953     // First is the chain
2954     Ops.push_back(N->getOperand(0));
2955 
2956     if (StoreF16x2) {
2957       // Combine f16,f16 -> v2f16
2958       NumElts /= 2;
2959       for (unsigned i = 0; i < NumElts; ++i) {
2960         SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2961                                  DAG.getIntPtrConstant(i * 2, DL));
2962         SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2963                                  DAG.getIntPtrConstant(i * 2 + 1, DL));
2964         EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2);
2965         SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1);
2966         Ops.push_back(V2);
2967       }
2968     } else {
2969       // Then the split values
2970       for (unsigned i = 0; i < NumElts; ++i) {
2971         SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2972                                      DAG.getIntPtrConstant(i, DL));
2973         if (NeedExt)
2974           ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
2975         Ops.push_back(ExtVal);
2976       }
2977     }
2978 
2979     // Then any remaining arguments
2980     Ops.append(N->op_begin() + 2, N->op_end());
2981 
2982     SDValue NewSt =
2983         DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
2984                                 MemSD->getMemoryVT(), MemSD->getMemOperand());
2985 
2987     return NewSt;
2988   }
2989 
2990   return SDValue();
2991 }
2992 
2993 // st i1 v, addr
2994 //    =>
2995 // v1 = zxt v to i16
2996 // st.u8 i16, addr
2997 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
2998   SDNode *Node = Op.getNode();
2999   SDLoc dl(Node);
3000   StoreSDNode *ST = cast<StoreSDNode>(Node);
3001   SDValue Tmp1 = ST->getChain();
3002   SDValue Tmp2 = ST->getBasePtr();
3003   SDValue Tmp3 = ST->getValue();
3004   assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3005   Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3006   SDValue Result =
3007       DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3008                         ST->getAlign(), ST->getMemOperand()->getFlags());
3009   return Result;
3010 }
3011 
3012 // This creates a target external symbol for a function parameter.
3013 // The symbol's name is composed from the parameter's index and the function name.
3014 // A negative index corresponds to the special parameter (unsized array) used
3015 // for passing variable arguments.
3016 SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,
3017                                             EVT v) const {
3018   StringRef SavedStr = nvTM->getStrPool().save(
3019       getParamName(&DAG.getMachineFunction().getFunction(), idx));
3020   return DAG.getTargetExternalSymbol(SavedStr.data(), v);
3021 }
3022 
3023 SDValue NVPTXTargetLowering::LowerFormalArguments(
3024     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3025     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3026     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3027   MachineFunction &MF = DAG.getMachineFunction();
3028   const DataLayout &DL = DAG.getDataLayout();
3029   auto PtrVT = getPointerTy(DAG.getDataLayout());
3030 
3031   const Function *F = &MF.getFunction();
3032   const AttributeList &PAL = F->getAttributes();
3033   const TargetLowering *TLI = STI.getTargetLowering();
3034 
3035   SDValue Root = DAG.getRoot();
3036   std::vector<SDValue> OutChains;
3037 
3038   bool isABI = (STI.getSmVersion() >= 20);
3039   assert(isABI && "Non-ABI compilation is not supported");
3040   if (!isABI)
3041     return Chain;
3042 
3043   std::vector<Type *> argTypes;
3044   std::vector<const Argument *> theArgs;
3045   for (const Argument &I : F->args()) {
3046     theArgs.push_back(&I);
3047     argTypes.push_back(I.getType());
3048   }
3049   // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
3050   // Ins.size() will be larger
3051   //   * if there is an aggregate argument with multiple fields (each field
3052   //     showing up separately in Ins)
3053   //   * if there is a vector argument with more elements than the typical
3054   //     vector length (generally more than 4), where each vector element
3055   //     appears individually in Ins.
3056   // So a different index should be used for indexing into Ins.
3057   // See similar issue in LowerCall.
3058   unsigned InsIdx = 0;
3059 
3060   int idx = 0;
3061   for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
3062     Type *Ty = argTypes[i];
3063 
3064     if (theArgs[i]->use_empty()) {
3065       // argument is dead
3066       if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) {
3067         SmallVector<EVT, 16> vtparts;
3068 
3069         ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
3070         if (vtparts.empty())
3071           report_fatal_error("Empty parameter types are not supported");
3072 
3073         for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
3074              ++parti) {
3075           InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3076           ++InsIdx;
3077         }
3078         if (vtparts.size() > 0)
3079           --InsIdx;
3080         continue;
3081       }
3082       if (Ty->isVectorTy()) {
3083         EVT ObjectVT = getValueType(DL, Ty);
3084         unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
3085         for (unsigned parti = 0; parti < NumRegs; ++parti) {
3086           InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3087           ++InsIdx;
3088         }
3089         if (NumRegs > 0)
3090           --InsIdx;
3091         continue;
3092       }
3093       InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3094       continue;
3095     }
3096 
3097     // In the following cases, assign a node order of "idx+1"
3098     // to newly created nodes. The SDNodes for params have to
3099     // appear in the same order as their order of appearance
3100     // in the original function. "idx+1" holds that order.
3101     if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
3102       bool aggregateIsPacked = false;
3103       if (StructType *STy = dyn_cast<StructType>(Ty))
3104         aggregateIsPacked = STy->isPacked();
3105 
3106       SmallVector<EVT, 16> VTs;
3107       SmallVector<uint64_t, 16> Offsets;
3108       ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
3109       if (VTs.empty())
3110         report_fatal_error("Empty parameter types are not supported");
3111 
3112       auto VectorInfo =
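           // VectorizePTXValueVTs groups adjacent parameter pieces that can be
           // loaded together; the PVF_FIRST/PVF_LAST flags consumed below mark
           // the start and end of each group so one vector load can cover it.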
3113           VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty));
3114 
3115       SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
3116       int VecIdx = -1; // Index of the first element of the current vector.
3117       for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
3118         if (VectorInfo[parti] & PVF_FIRST) {
3119           assert(VecIdx == -1 && "Orphaned vector.");
3120           VecIdx = parti;
3121         }
3122 
3123         // That's the last element of this store op.
3124         if (VectorInfo[parti] & PVF_LAST) {
3125           unsigned NumElts = parti - VecIdx + 1;
3126           EVT EltVT = VTs[parti];
3127           // i1 is loaded/stored as i8.
3128           EVT LoadVT = EltVT;
3129           if (EltVT == MVT::i1)
3130             LoadVT = MVT::i8;
3131           else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
3132             // getLoad needs a vector type, but it can't handle
3133             // vectors which contain v2f16 or v2bf16 elements. So we must load
3134             // using i32 here and then bitcast back.
3135             LoadVT = MVT::i32;
3136 
3137           EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
3138           SDValue VecAddr =
3139               DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
3140                           DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
3141           Value *srcValue = Constant::getNullValue(PointerType::get(
3142               EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
3143           SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
3144                                   MachinePointerInfo(srcValue),
3145                                   MaybeAlign(aggregateIsPacked ? 1 : 0),
3146                                   MachineMemOperand::MODereferenceable |
3147                                       MachineMemOperand::MOInvariant);
3148           if (P.getNode())
3149             P.getNode()->setIROrder(idx + 1);
3150           for (unsigned j = 0; j < NumElts; ++j) {
3151             SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
3152                                       DAG.getIntPtrConstant(j, dl));
3153             // We've loaded i1 as an i8 and now must truncate it back to i1
3154             if (EltVT == MVT::i1)
3155               Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
3156             // v2x16/v4i8 values were loaded as an i32. Now we must bitcast them back.
3157             else if (EltVT != LoadVT)
3158               Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt);
3159 
3160             // If a promoted integer type is used, truncate down to the original type.
3161             MVT PromotedVT;
3162             if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
3163               Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
3164             }
3165 
3166             // Extend the element if necessary (e.g. an i8 is loaded
3167             // into an i16 register)
3168             if (Ins[InsIdx].VT.isInteger() &&
3169                 Ins[InsIdx].VT.getFixedSizeInBits() >
3170                     LoadVT.getFixedSizeInBits()) {
3171               unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
3172                                                            : ISD::ZERO_EXTEND;
3173               Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
3174             }
3175             InVals.push_back(Elt);
3176           }
3177 
3178           // Reset vector tracking state.
3179           VecIdx = -1;
3180         }
3181         ++InsIdx;
3182       }
3183       if (VTs.size() > 0)
3184         --InsIdx;
3185       continue;
3186     }
3187 
3188     // Param has ByVal attribute
3189     // Return MoveParam(param symbol).
3190     // Ideally, the param symbol can be returned directly,
3191     // but when SDNode builder decides to use it in a CopyToReg(),
3192     // machine instruction fails because TargetExternalSymbol
3193     // (not lowered) is target dependent, and CopyToReg assumes
3194     // the source is lowered.
3195     EVT ObjectVT = getValueType(DL, Ty);
3196     assert(ObjectVT == Ins[InsIdx].VT &&
3197            "Ins type did not match function type");
3198     SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
3199     SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
3200     if (p.getNode())
3201       p.getNode()->setIROrder(idx + 1);
3202     InVals.push_back(p);
3203   }
3204 
3205   if (!OutChains.empty())
3206     DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
3207 
3208   return Chain;
3209 }
3210 
3211 SDValue
3212 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3213                                  bool isVarArg,
3214                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
3215                                  const SmallVectorImpl<SDValue> &OutVals,
3216                                  const SDLoc &dl, SelectionDAG &DAG) const {
3217   const MachineFunction &MF = DAG.getMachineFunction();
3218   const Function &F = MF.getFunction();
3219   Type *RetTy = MF.getFunction().getReturnType();
3220 
3221   bool isABI = (STI.getSmVersion() >= 20);
3222   assert(isABI && "Non-ABI compilation is not supported");
3223   if (!isABI)
3224     return Chain;
3225 
3226   const DataLayout &DL = DAG.getDataLayout();
3227   SmallVector<SDValue, 16> PromotedOutVals;
3228   SmallVector<EVT, 16> VTs;
3229   SmallVector<uint64_t, 16> Offsets;
3230   ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
3231   assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3232 
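       // If PromoteScalarIntegerPTX reports that a return value's scalar
       // integer type needs promotion, update VTs[i] to the promoted type and
       // sign- or zero-extend the value according to the return attribute.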
3233   for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3234     SDValue PromotedOutVal = OutVals[i];
3235     MVT PromotedVT;
3236     if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
3237       VTs[i] = EVT(PromotedVT);
3238     }
3239     if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) {
3240       llvm::ISD::NodeType Ext =
3241           Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3242       PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
3243     }
3244     PromotedOutVals.push_back(PromotedOutVal);
3245   }
3246 
3247   auto VectorInfo = VectorizePTXValueVTs(
3248       VTs, Offsets,
3249       RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
3250                        : Align(1));
3251 
3252   // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3253   // 32-bits are sign extended or zero extended, depending on whether
3254   // they are signed or unsigned types.
3255   bool ExtendIntegerRetVal =
3256       RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
3257 
3258   SmallVector<SDValue, 6> StoreOperands;
3259   for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3260     // New load/store. Record chain and offset operands.
3261     if (VectorInfo[i] & PVF_FIRST) {
3262       assert(StoreOperands.empty() && "Orphaned operand list.");
3263       StoreOperands.push_back(Chain);
3264       StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
3265     }
3266 
3267     SDValue OutVal = OutVals[i];
3268     SDValue RetVal = PromotedOutVals[i];
3269 
3270     if (ExtendIntegerRetVal) {
3271       RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
3272                                                   : ISD::ZERO_EXTEND,
3273                            dl, MVT::i32, RetVal);
3274     } else if (OutVal.getValueSizeInBits() < 16) {
3275       // Use 16-bit registers for small load-stores as it's the
3276       // smallest general purpose register size supported by NVPTX.
3277       RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
3278     }
3279 
3280     // Record the value to return.
3281     StoreOperands.push_back(RetVal);
3282 
3283     // That's the last element of this store op.
3284     if (VectorInfo[i] & PVF_LAST) {
3285       NVPTXISD::NodeType Op;
3286       unsigned NumElts = StoreOperands.size() - 2;
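           // StoreOperands holds {Chain, Offset, Val...}, so the number of
           // values written by this store is size() - 2.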
3287       switch (NumElts) {
3288       case 1:
3289         Op = NVPTXISD::StoreRetval;
3290         break;
3291       case 2:
3292         Op = NVPTXISD::StoreRetvalV2;
3293         break;
3294       case 4:
3295         Op = NVPTXISD::StoreRetvalV4;
3296         break;
3297       default:
3298         llvm_unreachable("Invalid vector info.");
3299       }
3300 
3301       // Adjust type of load/store op if we've extended the scalar
3302       // return value.
3303       EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3304       Chain = DAG.getMemIntrinsicNode(
3305           Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
3306           MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
3307       // Cleanup vector state.
3308       StoreOperands.clear();
3309     }
3310   }
3311 
3312   return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3313 }
3314 
3315 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
3316     SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
3317     SelectionDAG &DAG) const {
3318   if (Constraint.size() > 1)
3319     return;
3320   TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3321 }
3322 
3323 static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
3324   switch (Intrinsic) {
3325   default:
3326     return 0;
3327 
3328   case Intrinsic::nvvm_tex_1d_v4f32_s32:
3329     return NVPTXISD::Tex1DFloatS32;
3330   case Intrinsic::nvvm_tex_1d_v4f32_f32:
3331     return NVPTXISD::Tex1DFloatFloat;
3332   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3333     return NVPTXISD::Tex1DFloatFloatLevel;
3334   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3335     return NVPTXISD::Tex1DFloatFloatGrad;
3336   case Intrinsic::nvvm_tex_1d_v4s32_s32:
3337     return NVPTXISD::Tex1DS32S32;
3338   case Intrinsic::nvvm_tex_1d_v4s32_f32:
3339     return NVPTXISD::Tex1DS32Float;
3340   case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3341     return NVPTXISD::Tex1DS32FloatLevel;
3342   case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3343     return NVPTXISD::Tex1DS32FloatGrad;
3344   case Intrinsic::nvvm_tex_1d_v4u32_s32:
3345     return NVPTXISD::Tex1DU32S32;
3346   case Intrinsic::nvvm_tex_1d_v4u32_f32:
3347     return NVPTXISD::Tex1DU32Float;
3348   case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3349     return NVPTXISD::Tex1DU32FloatLevel;
3350   case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3351     return NVPTXISD::Tex1DU32FloatGrad;
3352 
3353   case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3354     return NVPTXISD::Tex1DArrayFloatS32;
3355   case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3356     return NVPTXISD::Tex1DArrayFloatFloat;
3357   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3358     return NVPTXISD::Tex1DArrayFloatFloatLevel;
3359   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3360     return NVPTXISD::Tex1DArrayFloatFloatGrad;
3361   case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3362     return NVPTXISD::Tex1DArrayS32S32;
3363   case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3364     return NVPTXISD::Tex1DArrayS32Float;
3365   case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3366     return NVPTXISD::Tex1DArrayS32FloatLevel;
3367   case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3368     return NVPTXISD::Tex1DArrayS32FloatGrad;
3369   case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3370     return NVPTXISD::Tex1DArrayU32S32;
3371   case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3372     return NVPTXISD::Tex1DArrayU32Float;
3373   case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3374     return NVPTXISD::Tex1DArrayU32FloatLevel;
3375   case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3376     return NVPTXISD::Tex1DArrayU32FloatGrad;
3377 
3378   case Intrinsic::nvvm_tex_2d_v4f32_s32:
3379     return NVPTXISD::Tex2DFloatS32;
3380   case Intrinsic::nvvm_tex_2d_v4f32_f32:
3381     return NVPTXISD::Tex2DFloatFloat;
3382   case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3383     return NVPTXISD::Tex2DFloatFloatLevel;
3384   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3385     return NVPTXISD::Tex2DFloatFloatGrad;
3386   case Intrinsic::nvvm_tex_2d_v4s32_s32:
3387     return NVPTXISD::Tex2DS32S32;
3388   case Intrinsic::nvvm_tex_2d_v4s32_f32:
3389     return NVPTXISD::Tex2DS32Float;
3390   case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3391     return NVPTXISD::Tex2DS32FloatLevel;
3392   case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3393     return NVPTXISD::Tex2DS32FloatGrad;
3394   case Intrinsic::nvvm_tex_2d_v4u32_s32:
3395     return NVPTXISD::Tex2DU32S32;
3396   case Intrinsic::nvvm_tex_2d_v4u32_f32:
3397     return NVPTXISD::Tex2DU32Float;
3398   case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3399     return NVPTXISD::Tex2DU32FloatLevel;
3400   case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3401     return NVPTXISD::Tex2DU32FloatGrad;
3402 
3403   case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3404     return NVPTXISD::Tex2DArrayFloatS32;
3405   case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3406     return NVPTXISD::Tex2DArrayFloatFloat;
3407   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3408     return NVPTXISD::Tex2DArrayFloatFloatLevel;
3409   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3410     return NVPTXISD::Tex2DArrayFloatFloatGrad;
3411   case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3412     return NVPTXISD::Tex2DArrayS32S32;
3413   case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3414     return NVPTXISD::Tex2DArrayS32Float;
3415   case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3416     return NVPTXISD::Tex2DArrayS32FloatLevel;
3417   case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3418     return NVPTXISD::Tex2DArrayS32FloatGrad;
3419   case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3420     return NVPTXISD::Tex2DArrayU32S32;
3421   case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3422     return NVPTXISD::Tex2DArrayU32Float;
3423   case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3424     return NVPTXISD::Tex2DArrayU32FloatLevel;
3425   case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3426     return NVPTXISD::Tex2DArrayU32FloatGrad;
3427 
3428   case Intrinsic::nvvm_tex_3d_v4f32_s32:
3429     return NVPTXISD::Tex3DFloatS32;
3430   case Intrinsic::nvvm_tex_3d_v4f32_f32:
3431     return NVPTXISD::Tex3DFloatFloat;
3432   case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3433     return NVPTXISD::Tex3DFloatFloatLevel;
3434   case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3435     return NVPTXISD::Tex3DFloatFloatGrad;
3436   case Intrinsic::nvvm_tex_3d_v4s32_s32:
3437     return NVPTXISD::Tex3DS32S32;
3438   case Intrinsic::nvvm_tex_3d_v4s32_f32:
3439     return NVPTXISD::Tex3DS32Float;
3440   case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3441     return NVPTXISD::Tex3DS32FloatLevel;
3442   case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3443     return NVPTXISD::Tex3DS32FloatGrad;
3444   case Intrinsic::nvvm_tex_3d_v4u32_s32:
3445     return NVPTXISD::Tex3DU32S32;
3446   case Intrinsic::nvvm_tex_3d_v4u32_f32:
3447     return NVPTXISD::Tex3DU32Float;
3448   case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3449     return NVPTXISD::Tex3DU32FloatLevel;
3450   case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3451     return NVPTXISD::Tex3DU32FloatGrad;
3452 
3453   case Intrinsic::nvvm_tex_cube_v4f32_f32:
3454     return NVPTXISD::TexCubeFloatFloat;
3455   case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3456     return NVPTXISD::TexCubeFloatFloatLevel;
3457   case Intrinsic::nvvm_tex_cube_v4s32_f32:
3458     return NVPTXISD::TexCubeS32Float;
3459   case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3460     return NVPTXISD::TexCubeS32FloatLevel;
3461   case Intrinsic::nvvm_tex_cube_v4u32_f32:
3462     return NVPTXISD::TexCubeU32Float;
3463   case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3464     return NVPTXISD::TexCubeU32FloatLevel;
3465 
3466   case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3467     return NVPTXISD::TexCubeArrayFloatFloat;
3468   case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3469     return NVPTXISD::TexCubeArrayFloatFloatLevel;
3470   case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3471     return NVPTXISD::TexCubeArrayS32Float;
3472   case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3473     return NVPTXISD::TexCubeArrayS32FloatLevel;
3474   case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3475     return NVPTXISD::TexCubeArrayU32Float;
3476   case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3477     return NVPTXISD::TexCubeArrayU32FloatLevel;
3478 
3479   case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3480     return NVPTXISD::Tld4R2DFloatFloat;
3481   case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3482     return NVPTXISD::Tld4G2DFloatFloat;
3483   case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3484     return NVPTXISD::Tld4B2DFloatFloat;
3485   case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3486     return NVPTXISD::Tld4A2DFloatFloat;
3487   case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3488     return NVPTXISD::Tld4R2DS64Float;
3489   case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3490     return NVPTXISD::Tld4G2DS64Float;
3491   case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3492     return NVPTXISD::Tld4B2DS64Float;
3493   case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3494     return NVPTXISD::Tld4A2DS64Float;
3495   case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3496     return NVPTXISD::Tld4R2DU64Float;
3497   case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3498     return NVPTXISD::Tld4G2DU64Float;
3499   case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3500     return NVPTXISD::Tld4B2DU64Float;
3501   case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3502     return NVPTXISD::Tld4A2DU64Float;
3503 
3504   case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3505     return NVPTXISD::TexUnified1DFloatS32;
3506   case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3507     return NVPTXISD::TexUnified1DFloatFloat;
3508   case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3509     return NVPTXISD::TexUnified1DFloatFloatLevel;
3510   case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3511     return NVPTXISD::TexUnified1DFloatFloatGrad;
3512   case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3513     return NVPTXISD::TexUnified1DS32S32;
3514   case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3515     return NVPTXISD::TexUnified1DS32Float;
3516   case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3517     return NVPTXISD::TexUnified1DS32FloatLevel;
3518   case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3519     return NVPTXISD::TexUnified1DS32FloatGrad;
3520   case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3521     return NVPTXISD::TexUnified1DU32S32;
3522   case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3523     return NVPTXISD::TexUnified1DU32Float;
3524   case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3525     return NVPTXISD::TexUnified1DU32FloatLevel;
3526   case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3527     return NVPTXISD::TexUnified1DU32FloatGrad;
3528 
3529   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3530     return NVPTXISD::TexUnified1DArrayFloatS32;
3531   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3532     return NVPTXISD::TexUnified1DArrayFloatFloat;
3533   case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3534     return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
3535   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3536     return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
3537   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3538     return NVPTXISD::TexUnified1DArrayS32S32;
3539   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3540     return NVPTXISD::TexUnified1DArrayS32Float;
3541   case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3542     return NVPTXISD::TexUnified1DArrayS32FloatLevel;
3543   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3544     return NVPTXISD::TexUnified1DArrayS32FloatGrad;
3545   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3546     return NVPTXISD::TexUnified1DArrayU32S32;
3547   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3548     return NVPTXISD::TexUnified1DArrayU32Float;
3549   case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3550     return NVPTXISD::TexUnified1DArrayU32FloatLevel;
3551   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3552     return NVPTXISD::TexUnified1DArrayU32FloatGrad;
3553 
3554   case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3555     return NVPTXISD::TexUnified2DFloatS32;
3556   case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3557     return NVPTXISD::TexUnified2DFloatFloat;
3558   case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3559     return NVPTXISD::TexUnified2DFloatFloatLevel;
3560   case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3561     return NVPTXISD::TexUnified2DFloatFloatGrad;
3562   case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3563     return NVPTXISD::TexUnified2DS32S32;
3564   case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3565     return NVPTXISD::TexUnified2DS32Float;
3566   case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3567     return NVPTXISD::TexUnified2DS32FloatLevel;
3568   case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3569     return NVPTXISD::TexUnified2DS32FloatGrad;
3570   case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3571     return NVPTXISD::TexUnified2DU32S32;
3572   case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3573     return NVPTXISD::TexUnified2DU32Float;
3574   case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3575     return NVPTXISD::TexUnified2DU32FloatLevel;
3576   case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3577     return NVPTXISD::TexUnified2DU32FloatGrad;
3578 
3579   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3580     return NVPTXISD::TexUnified2DArrayFloatS32;
3581   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3582     return NVPTXISD::TexUnified2DArrayFloatFloat;
3583   case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3584     return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
3585   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3586     return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
3587   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3588     return NVPTXISD::TexUnified2DArrayS32S32;
3589   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3590     return NVPTXISD::TexUnified2DArrayS32Float;
3591   case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3592     return NVPTXISD::TexUnified2DArrayS32FloatLevel;
3593   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3594     return NVPTXISD::TexUnified2DArrayS32FloatGrad;
3595   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3596     return NVPTXISD::TexUnified2DArrayU32S32;
3597   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3598     return NVPTXISD::TexUnified2DArrayU32Float;
3599   case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3600     return NVPTXISD::TexUnified2DArrayU32FloatLevel;
3601   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3602     return NVPTXISD::TexUnified2DArrayU32FloatGrad;
3603 
3604   case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3605     return NVPTXISD::TexUnified3DFloatS32;
3606   case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3607     return NVPTXISD::TexUnified3DFloatFloat;
3608   case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3609     return NVPTXISD::TexUnified3DFloatFloatLevel;
3610   case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3611     return NVPTXISD::TexUnified3DFloatFloatGrad;
3612   case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3613     return NVPTXISD::TexUnified3DS32S32;
3614   case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3615     return NVPTXISD::TexUnified3DS32Float;
3616   case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3617     return NVPTXISD::TexUnified3DS32FloatLevel;
3618   case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3619     return NVPTXISD::TexUnified3DS32FloatGrad;
3620   case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3621     return NVPTXISD::TexUnified3DU32S32;
3622   case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3623     return NVPTXISD::TexUnified3DU32Float;
3624   case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3625     return NVPTXISD::TexUnified3DU32FloatLevel;
3626   case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3627     return NVPTXISD::TexUnified3DU32FloatGrad;
3628 
3629   case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3630     return NVPTXISD::TexUnifiedCubeFloatFloat;
3631   case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3632     return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
3633   case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3634     return NVPTXISD::TexUnifiedCubeS32Float;
3635   case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3636     return NVPTXISD::TexUnifiedCubeS32FloatLevel;
3637   case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3638     return NVPTXISD::TexUnifiedCubeU32Float;
3639   case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3640     return NVPTXISD::TexUnifiedCubeU32FloatLevel;
3641 
3642   case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3643     return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
3644   case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3645     return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
3646   case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3647     return NVPTXISD::TexUnifiedCubeArrayS32Float;
3648   case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3649     return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
3650   case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3651     return NVPTXISD::TexUnifiedCubeArrayU32Float;
3652   case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3653     return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
3654 
3655   case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3656     return NVPTXISD::Tld4UnifiedR2DFloatFloat;
3657   case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3658     return NVPTXISD::Tld4UnifiedG2DFloatFloat;
3659   case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3660     return NVPTXISD::Tld4UnifiedB2DFloatFloat;
3661   case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3662     return NVPTXISD::Tld4UnifiedA2DFloatFloat;
3663   case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3664     return NVPTXISD::Tld4UnifiedR2DS64Float;
3665   case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3666     return NVPTXISD::Tld4UnifiedG2DS64Float;
3667   case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3668     return NVPTXISD::Tld4UnifiedB2DS64Float;
3669   case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3670     return NVPTXISD::Tld4UnifiedA2DS64Float;
3671   case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3672     return NVPTXISD::Tld4UnifiedR2DU64Float;
3673   case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3674     return NVPTXISD::Tld4UnifiedG2DU64Float;
3675   case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3676     return NVPTXISD::Tld4UnifiedB2DU64Float;
3677   case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3678     return NVPTXISD::Tld4UnifiedA2DU64Float;
3679   }
3680 }
3681 
3682 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3683   switch (Intrinsic) {
3684   default:
3685     return 0;
3686   case Intrinsic::nvvm_suld_1d_i8_clamp:
3687     return NVPTXISD::Suld1DI8Clamp;
3688   case Intrinsic::nvvm_suld_1d_i16_clamp:
3689     return NVPTXISD::Suld1DI16Clamp;
3690   case Intrinsic::nvvm_suld_1d_i32_clamp:
3691     return NVPTXISD::Suld1DI32Clamp;
3692   case Intrinsic::nvvm_suld_1d_i64_clamp:
3693     return NVPTXISD::Suld1DI64Clamp;
3694   case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3695     return NVPTXISD::Suld1DV2I8Clamp;
3696   case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3697     return NVPTXISD::Suld1DV2I16Clamp;
3698   case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3699     return NVPTXISD::Suld1DV2I32Clamp;
3700   case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3701     return NVPTXISD::Suld1DV2I64Clamp;
3702   case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3703     return NVPTXISD::Suld1DV4I8Clamp;
3704   case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3705     return NVPTXISD::Suld1DV4I16Clamp;
3706   case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3707     return NVPTXISD::Suld1DV4I32Clamp;
3708   case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3709     return NVPTXISD::Suld1DArrayI8Clamp;
3710   case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3711     return NVPTXISD::Suld1DArrayI16Clamp;
3712   case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3713     return NVPTXISD::Suld1DArrayI32Clamp;
3714   case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3715     return NVPTXISD::Suld1DArrayI64Clamp;
3716   case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3717     return NVPTXISD::Suld1DArrayV2I8Clamp;
3718   case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3719     return NVPTXISD::Suld1DArrayV2I16Clamp;
3720   case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3721     return NVPTXISD::Suld1DArrayV2I32Clamp;
3722   case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3723     return NVPTXISD::Suld1DArrayV2I64Clamp;
3724   case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3725     return NVPTXISD::Suld1DArrayV4I8Clamp;
3726   case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3727     return NVPTXISD::Suld1DArrayV4I16Clamp;
3728   case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3729     return NVPTXISD::Suld1DArrayV4I32Clamp;
3730   case Intrinsic::nvvm_suld_2d_i8_clamp:
3731     return NVPTXISD::Suld2DI8Clamp;
3732   case Intrinsic::nvvm_suld_2d_i16_clamp:
3733     return NVPTXISD::Suld2DI16Clamp;
3734   case Intrinsic::nvvm_suld_2d_i32_clamp:
3735     return NVPTXISD::Suld2DI32Clamp;
3736   case Intrinsic::nvvm_suld_2d_i64_clamp:
3737     return NVPTXISD::Suld2DI64Clamp;
3738   case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3739     return NVPTXISD::Suld2DV2I8Clamp;
3740   case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3741     return NVPTXISD::Suld2DV2I16Clamp;
3742   case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3743     return NVPTXISD::Suld2DV2I32Clamp;
3744   case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3745     return NVPTXISD::Suld2DV2I64Clamp;
3746   case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3747     return NVPTXISD::Suld2DV4I8Clamp;
3748   case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3749     return NVPTXISD::Suld2DV4I16Clamp;
3750   case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3751     return NVPTXISD::Suld2DV4I32Clamp;
3752   case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3753     return NVPTXISD::Suld2DArrayI8Clamp;
3754   case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3755     return NVPTXISD::Suld2DArrayI16Clamp;
3756   case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3757     return NVPTXISD::Suld2DArrayI32Clamp;
3758   case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3759     return NVPTXISD::Suld2DArrayI64Clamp;
3760   case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3761     return NVPTXISD::Suld2DArrayV2I8Clamp;
3762   case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3763     return NVPTXISD::Suld2DArrayV2I16Clamp;
3764   case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3765     return NVPTXISD::Suld2DArrayV2I32Clamp;
3766   case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3767     return NVPTXISD::Suld2DArrayV2I64Clamp;
3768   case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3769     return NVPTXISD::Suld2DArrayV4I8Clamp;
3770   case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3771     return NVPTXISD::Suld2DArrayV4I16Clamp;
3772   case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3773     return NVPTXISD::Suld2DArrayV4I32Clamp;
3774   case Intrinsic::nvvm_suld_3d_i8_clamp:
3775     return NVPTXISD::Suld3DI8Clamp;
3776   case Intrinsic::nvvm_suld_3d_i16_clamp:
3777     return NVPTXISD::Suld3DI16Clamp;
3778   case Intrinsic::nvvm_suld_3d_i32_clamp:
3779     return NVPTXISD::Suld3DI32Clamp;
3780   case Intrinsic::nvvm_suld_3d_i64_clamp:
3781     return NVPTXISD::Suld3DI64Clamp;
3782   case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3783     return NVPTXISD::Suld3DV2I8Clamp;
3784   case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3785     return NVPTXISD::Suld3DV2I16Clamp;
3786   case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3787     return NVPTXISD::Suld3DV2I32Clamp;
3788   case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3789     return NVPTXISD::Suld3DV2I64Clamp;
3790   case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3791     return NVPTXISD::Suld3DV4I8Clamp;
3792   case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3793     return NVPTXISD::Suld3DV4I16Clamp;
3794   case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3795     return NVPTXISD::Suld3DV4I32Clamp;
3796   case Intrinsic::nvvm_suld_1d_i8_trap:
3797     return NVPTXISD::Suld1DI8Trap;
3798   case Intrinsic::nvvm_suld_1d_i16_trap:
3799     return NVPTXISD::Suld1DI16Trap;
3800   case Intrinsic::nvvm_suld_1d_i32_trap:
3801     return NVPTXISD::Suld1DI32Trap;
3802   case Intrinsic::nvvm_suld_1d_i64_trap:
3803     return NVPTXISD::Suld1DI64Trap;
3804   case Intrinsic::nvvm_suld_1d_v2i8_trap:
3805     return NVPTXISD::Suld1DV2I8Trap;
3806   case Intrinsic::nvvm_suld_1d_v2i16_trap:
3807     return NVPTXISD::Suld1DV2I16Trap;
3808   case Intrinsic::nvvm_suld_1d_v2i32_trap:
3809     return NVPTXISD::Suld1DV2I32Trap;
3810   case Intrinsic::nvvm_suld_1d_v2i64_trap:
3811     return NVPTXISD::Suld1DV2I64Trap;
3812   case Intrinsic::nvvm_suld_1d_v4i8_trap:
3813     return NVPTXISD::Suld1DV4I8Trap;
3814   case Intrinsic::nvvm_suld_1d_v4i16_trap:
3815     return NVPTXISD::Suld1DV4I16Trap;
3816   case Intrinsic::nvvm_suld_1d_v4i32_trap:
3817     return NVPTXISD::Suld1DV4I32Trap;
3818   case Intrinsic::nvvm_suld_1d_array_i8_trap:
3819     return NVPTXISD::Suld1DArrayI8Trap;
3820   case Intrinsic::nvvm_suld_1d_array_i16_trap:
3821     return NVPTXISD::Suld1DArrayI16Trap;
3822   case Intrinsic::nvvm_suld_1d_array_i32_trap:
3823     return NVPTXISD::Suld1DArrayI32Trap;
3824   case Intrinsic::nvvm_suld_1d_array_i64_trap:
3825     return NVPTXISD::Suld1DArrayI64Trap;
3826   case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3827     return NVPTXISD::Suld1DArrayV2I8Trap;
3828   case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3829     return NVPTXISD::Suld1DArrayV2I16Trap;
3830   case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3831     return NVPTXISD::Suld1DArrayV2I32Trap;
3832   case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3833     return NVPTXISD::Suld1DArrayV2I64Trap;
3834   case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3835     return NVPTXISD::Suld1DArrayV4I8Trap;
3836   case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3837     return NVPTXISD::Suld1DArrayV4I16Trap;
3838   case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3839     return NVPTXISD::Suld1DArrayV4I32Trap;
3840   case Intrinsic::nvvm_suld_2d_i8_trap:
3841     return NVPTXISD::Suld2DI8Trap;
3842   case Intrinsic::nvvm_suld_2d_i16_trap:
3843     return NVPTXISD::Suld2DI16Trap;
3844   case Intrinsic::nvvm_suld_2d_i32_trap:
3845     return NVPTXISD::Suld2DI32Trap;
3846   case Intrinsic::nvvm_suld_2d_i64_trap:
3847     return NVPTXISD::Suld2DI64Trap;
3848   case Intrinsic::nvvm_suld_2d_v2i8_trap:
3849     return NVPTXISD::Suld2DV2I8Trap;
3850   case Intrinsic::nvvm_suld_2d_v2i16_trap:
3851     return NVPTXISD::Suld2DV2I16Trap;
3852   case Intrinsic::nvvm_suld_2d_v2i32_trap:
3853     return NVPTXISD::Suld2DV2I32Trap;
3854   case Intrinsic::nvvm_suld_2d_v2i64_trap:
3855     return NVPTXISD::Suld2DV2I64Trap;
3856   case Intrinsic::nvvm_suld_2d_v4i8_trap:
3857     return NVPTXISD::Suld2DV4I8Trap;
3858   case Intrinsic::nvvm_suld_2d_v4i16_trap:
3859     return NVPTXISD::Suld2DV4I16Trap;
3860   case Intrinsic::nvvm_suld_2d_v4i32_trap:
3861     return NVPTXISD::Suld2DV4I32Trap;
3862   case Intrinsic::nvvm_suld_2d_array_i8_trap:
3863     return NVPTXISD::Suld2DArrayI8Trap;
3864   case Intrinsic::nvvm_suld_2d_array_i16_trap:
3865     return NVPTXISD::Suld2DArrayI16Trap;
3866   case Intrinsic::nvvm_suld_2d_array_i32_trap:
3867     return NVPTXISD::Suld2DArrayI32Trap;
3868   case Intrinsic::nvvm_suld_2d_array_i64_trap:
3869     return NVPTXISD::Suld2DArrayI64Trap;
3870   case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3871     return NVPTXISD::Suld2DArrayV2I8Trap;
3872   case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3873     return NVPTXISD::Suld2DArrayV2I16Trap;
3874   case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3875     return NVPTXISD::Suld2DArrayV2I32Trap;
3876   case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3877     return NVPTXISD::Suld2DArrayV2I64Trap;
3878   case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3879     return NVPTXISD::Suld2DArrayV4I8Trap;
3880   case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3881     return NVPTXISD::Suld2DArrayV4I16Trap;
3882   case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3883     return NVPTXISD::Suld2DArrayV4I32Trap;
3884   case Intrinsic::nvvm_suld_3d_i8_trap:
3885     return NVPTXISD::Suld3DI8Trap;
3886   case Intrinsic::nvvm_suld_3d_i16_trap:
3887     return NVPTXISD::Suld3DI16Trap;
3888   case Intrinsic::nvvm_suld_3d_i32_trap:
3889     return NVPTXISD::Suld3DI32Trap;
3890   case Intrinsic::nvvm_suld_3d_i64_trap:
3891     return NVPTXISD::Suld3DI64Trap;
3892   case Intrinsic::nvvm_suld_3d_v2i8_trap:
3893     return NVPTXISD::Suld3DV2I8Trap;
3894   case Intrinsic::nvvm_suld_3d_v2i16_trap:
3895     return NVPTXISD::Suld3DV2I16Trap;
3896   case Intrinsic::nvvm_suld_3d_v2i32_trap:
3897     return NVPTXISD::Suld3DV2I32Trap;
3898   case Intrinsic::nvvm_suld_3d_v2i64_trap:
3899     return NVPTXISD::Suld3DV2I64Trap;
3900   case Intrinsic::nvvm_suld_3d_v4i8_trap:
3901     return NVPTXISD::Suld3DV4I8Trap;
3902   case Intrinsic::nvvm_suld_3d_v4i16_trap:
3903     return NVPTXISD::Suld3DV4I16Trap;
3904   case Intrinsic::nvvm_suld_3d_v4i32_trap:
3905     return NVPTXISD::Suld3DV4I32Trap;
3906   case Intrinsic::nvvm_suld_1d_i8_zero:
3907     return NVPTXISD::Suld1DI8Zero;
3908   case Intrinsic::nvvm_suld_1d_i16_zero:
3909     return NVPTXISD::Suld1DI16Zero;
3910   case Intrinsic::nvvm_suld_1d_i32_zero:
3911     return NVPTXISD::Suld1DI32Zero;
3912   case Intrinsic::nvvm_suld_1d_i64_zero:
3913     return NVPTXISD::Suld1DI64Zero;
3914   case Intrinsic::nvvm_suld_1d_v2i8_zero:
3915     return NVPTXISD::Suld1DV2I8Zero;
3916   case Intrinsic::nvvm_suld_1d_v2i16_zero:
3917     return NVPTXISD::Suld1DV2I16Zero;
3918   case Intrinsic::nvvm_suld_1d_v2i32_zero:
3919     return NVPTXISD::Suld1DV2I32Zero;
3920   case Intrinsic::nvvm_suld_1d_v2i64_zero:
3921     return NVPTXISD::Suld1DV2I64Zero;
3922   case Intrinsic::nvvm_suld_1d_v4i8_zero:
3923     return NVPTXISD::Suld1DV4I8Zero;
3924   case Intrinsic::nvvm_suld_1d_v4i16_zero:
3925     return NVPTXISD::Suld1DV4I16Zero;
3926   case Intrinsic::nvvm_suld_1d_v4i32_zero:
3927     return NVPTXISD::Suld1DV4I32Zero;
3928   case Intrinsic::nvvm_suld_1d_array_i8_zero:
3929     return NVPTXISD::Suld1DArrayI8Zero;
3930   case Intrinsic::nvvm_suld_1d_array_i16_zero:
3931     return NVPTXISD::Suld1DArrayI16Zero;
3932   case Intrinsic::nvvm_suld_1d_array_i32_zero:
3933     return NVPTXISD::Suld1DArrayI32Zero;
3934   case Intrinsic::nvvm_suld_1d_array_i64_zero:
3935     return NVPTXISD::Suld1DArrayI64Zero;
3936   case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3937     return NVPTXISD::Suld1DArrayV2I8Zero;
3938   case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3939     return NVPTXISD::Suld1DArrayV2I16Zero;
3940   case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3941     return NVPTXISD::Suld1DArrayV2I32Zero;
3942   case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3943     return NVPTXISD::Suld1DArrayV2I64Zero;
3944   case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3945     return NVPTXISD::Suld1DArrayV4I8Zero;
3946   case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3947     return NVPTXISD::Suld1DArrayV4I16Zero;
3948   case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3949     return NVPTXISD::Suld1DArrayV4I32Zero;
3950   case Intrinsic::nvvm_suld_2d_i8_zero:
3951     return NVPTXISD::Suld2DI8Zero;
3952   case Intrinsic::nvvm_suld_2d_i16_zero:
3953     return NVPTXISD::Suld2DI16Zero;
3954   case Intrinsic::nvvm_suld_2d_i32_zero:
3955     return NVPTXISD::Suld2DI32Zero;
3956   case Intrinsic::nvvm_suld_2d_i64_zero:
3957     return NVPTXISD::Suld2DI64Zero;
3958   case Intrinsic::nvvm_suld_2d_v2i8_zero:
3959     return NVPTXISD::Suld2DV2I8Zero;
3960   case Intrinsic::nvvm_suld_2d_v2i16_zero:
3961     return NVPTXISD::Suld2DV2I16Zero;
3962   case Intrinsic::nvvm_suld_2d_v2i32_zero:
3963     return NVPTXISD::Suld2DV2I32Zero;
3964   case Intrinsic::nvvm_suld_2d_v2i64_zero:
3965     return NVPTXISD::Suld2DV2I64Zero;
3966   case Intrinsic::nvvm_suld_2d_v4i8_zero:
3967     return NVPTXISD::Suld2DV4I8Zero;
3968   case Intrinsic::nvvm_suld_2d_v4i16_zero:
3969     return NVPTXISD::Suld2DV4I16Zero;
3970   case Intrinsic::nvvm_suld_2d_v4i32_zero:
3971     return NVPTXISD::Suld2DV4I32Zero;
3972   case Intrinsic::nvvm_suld_2d_array_i8_zero:
3973     return NVPTXISD::Suld2DArrayI8Zero;
3974   case Intrinsic::nvvm_suld_2d_array_i16_zero:
3975     return NVPTXISD::Suld2DArrayI16Zero;
3976   case Intrinsic::nvvm_suld_2d_array_i32_zero:
3977     return NVPTXISD::Suld2DArrayI32Zero;
3978   case Intrinsic::nvvm_suld_2d_array_i64_zero:
3979     return NVPTXISD::Suld2DArrayI64Zero;
3980   case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3981     return NVPTXISD::Suld2DArrayV2I8Zero;
3982   case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3983     return NVPTXISD::Suld2DArrayV2I16Zero;
3984   case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3985     return NVPTXISD::Suld2DArrayV2I32Zero;
3986   case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3987     return NVPTXISD::Suld2DArrayV2I64Zero;
3988   case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3989     return NVPTXISD::Suld2DArrayV4I8Zero;
3990   case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3991     return NVPTXISD::Suld2DArrayV4I16Zero;
3992   case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3993     return NVPTXISD::Suld2DArrayV4I32Zero;
3994   case Intrinsic::nvvm_suld_3d_i8_zero:
3995     return NVPTXISD::Suld3DI8Zero;
3996   case Intrinsic::nvvm_suld_3d_i16_zero:
3997     return NVPTXISD::Suld3DI16Zero;
3998   case Intrinsic::nvvm_suld_3d_i32_zero:
3999     return NVPTXISD::Suld3DI32Zero;
4000   case Intrinsic::nvvm_suld_3d_i64_zero:
4001     return NVPTXISD::Suld3DI64Zero;
4002   case Intrinsic::nvvm_suld_3d_v2i8_zero:
4003     return NVPTXISD::Suld3DV2I8Zero;
4004   case Intrinsic::nvvm_suld_3d_v2i16_zero:
4005     return NVPTXISD::Suld3DV2I16Zero;
4006   case Intrinsic::nvvm_suld_3d_v2i32_zero:
4007     return NVPTXISD::Suld3DV2I32Zero;
4008   case Intrinsic::nvvm_suld_3d_v2i64_zero:
4009     return NVPTXISD::Suld3DV2I64Zero;
4010   case Intrinsic::nvvm_suld_3d_v4i8_zero:
4011     return NVPTXISD::Suld3DV4I8Zero;
4012   case Intrinsic::nvvm_suld_3d_v4i16_zero:
4013     return NVPTXISD::Suld3DV4I16Zero;
4014   case Intrinsic::nvvm_suld_3d_v4i32_zero:
4015     return NVPTXISD::Suld3DV4I32Zero;
4016   }
4017 }
4018 
4019 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
4020 // TgtMemIntrinsic because we need the information that is only available in
4021 // the "Value" type of the destination pointer. In particular, the address
4022 // space information.
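     // For each intrinsic recognized below, IntrinsicInfo is filled in (SDag
     // opcode, memory VT, pointer operand, offset, flags, alignment) so that
     // SelectionDAG can attach a MachineMemOperand to the lowered node.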
4024 bool NVPTXTargetLowering::getTgtMemIntrinsic(
4025     IntrinsicInfo &Info, const CallInst &I,
4026     MachineFunction &MF, unsigned Intrinsic) const {
4027   switch (Intrinsic) {
4028   default:
4029     return false;
4030   case Intrinsic::nvvm_match_all_sync_i32p:
4031   case Intrinsic::nvvm_match_all_sync_i64p:
4032     Info.opc = ISD::INTRINSIC_W_CHAIN;
4033     // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
4034     // in order to model data exchange with other threads, but perform no real
4035     // memory accesses.
4036     Info.memVT = MVT::i1;
4037 
4038     // The result depends on both our own and the other threads' arguments.
4039     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4040     return true;
4041   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
4042   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
4043   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
4044   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
4045   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
4046   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
4047   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
4048   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
4049   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
4050   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
4051   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
4052   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
4053   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
4054   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
4055   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
4056   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
4057   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
4058   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
4059   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
4060   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
4061   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
4062   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
4063   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
4064   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
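         // f16 A/B matrix fragments are modeled as a single v8f16 (16-byte)
         // load with 16-byte alignment.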
4065     Info.opc = ISD::INTRINSIC_W_CHAIN;
4066     Info.memVT = MVT::v8f16;
4067     Info.ptrVal = I.getArgOperand(0);
4068     Info.offset = 0;
4069     Info.flags = MachineMemOperand::MOLoad;
4070     Info.align = Align(16);
4071     return true;
4072   }
4073   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
4074   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
4075   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
4076   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
4077   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
4078   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
4079   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
4080   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
4081   case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
4082   case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
4083   case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
4084   case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
4085   case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
4086   case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
4087   case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
4088   case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
4089   case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
4090   case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
4091   case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
4092   case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
4093   case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
4094   case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
4095   case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
4096   case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
4097     Info.opc = ISD::INTRINSIC_W_CHAIN;
4098     Info.memVT = MVT::v2i32;
4099     Info.ptrVal = I.getArgOperand(0);
4100     Info.offset = 0;
4101     Info.flags = MachineMemOperand::MOLoad;
4102     Info.align = Align(8);
4103     return true;
4104   }
4105 
4106   case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
4107   case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
4108   case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
4109   case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
4110   case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
4111   case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
4112   case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
4113   case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
4114   case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
4115   case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
4116   case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
4117   case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
4118   case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
4119   case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
4120   case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
4121   case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
4122 
4123   case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
4124   case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
4125   case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
4126   case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
4127   case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
4128   case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
4129   case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
4130   case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
4131   case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
4132   case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
4133   case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
4134   case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
4135   case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
4136   case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
4137   case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
4138   case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
4139   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
4140   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
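         // These fragments occupy four 32-bit registers per thread and are
         // modeled as a v4i32 (16-byte) load.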
4141     Info.opc = ISD::INTRINSIC_W_CHAIN;
4142     Info.memVT = MVT::v4i32;
4143     Info.ptrVal = I.getArgOperand(0);
4144     Info.offset = 0;
4145     Info.flags = MachineMemOperand::MOLoad;
4146     Info.align = Align(16);
4147     return true;
4148   }
4149 
4150   case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
4151   case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
4152   case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
4153   case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
4154   case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
4155   case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
4156   case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
4157   case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
4158 
4159   case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
4160   case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
4161   case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
4162   case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
4163   case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
4164   case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
4165   case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
4166   case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
4167   case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
4168   case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
4169   case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
4170   case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
4171   case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
4172   case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
4173   case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
4174   case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
4175   case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
4176   case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
4177   case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
4178   case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
4179   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
4180   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
4181     Info.opc = ISD::INTRINSIC_W_CHAIN;
4182     Info.memVT = MVT::i32;
4183     Info.ptrVal = I.getArgOperand(0);
4184     Info.offset = 0;
4185     Info.flags = MachineMemOperand::MOLoad;
4186     Info.align = Align(4);
4187     return true;
4188   }
4189 
4190   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
4191   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
4192   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
4193   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
4194   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
4195   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
4196   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
4197   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
4198   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
4199   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
4200   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
4201   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
4202     Info.opc = ISD::INTRINSIC_W_CHAIN;
4203     Info.memVT = MVT::v4f16;
4204     Info.ptrVal = I.getArgOperand(0);
4205     Info.offset = 0;
4206     Info.flags = MachineMemOperand::MOLoad;
4207     Info.align = Align(16);
4208     return true;
4209   }
4210 
4211   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
4212   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
4213   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
4214   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
4215   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
4216   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
4217   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
4218   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
4219   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
4220   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
4221   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
4222   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4223   case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4224   case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4225   case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4226   case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4227     Info.opc = ISD::INTRINSIC_W_CHAIN;
4228     Info.memVT = MVT::v8f32;
4229     Info.ptrVal = I.getArgOperand(0);
4230     Info.offset = 0;
4231     Info.flags = MachineMemOperand::MOLoad;
4232     Info.align = Align(16);
4233     return true;
4234   }
4235 
4236   case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4237   case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4238   case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4239   case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4240 
4241   case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4242   case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4243   case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4244   case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4245 
4246   case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4247   case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4248   case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4249   case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4250   case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4251   case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4252   case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4253   case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4254   case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4255   case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4256   case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4257   case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4258     Info.opc = ISD::INTRINSIC_W_CHAIN;
4259     Info.memVT = MVT::v8i32;
4260     Info.ptrVal = I.getArgOperand(0);
4261     Info.offset = 0;
4262     Info.flags = MachineMemOperand::MOLoad;
4263     Info.align = Align(16);
4264     return true;
4265   }
4266 
4267   case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4268   case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4269   case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4270   case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4271   case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4272   case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4273   case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4274   case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4275   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4276   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
4277     Info.opc = ISD::INTRINSIC_W_CHAIN;
4278     Info.memVT = MVT::v2i32;
4279     Info.ptrVal = I.getArgOperand(0);
4280     Info.offset = 0;
4281     Info.flags = MachineMemOperand::MOLoad;
4282     Info.align = Align(8);
4283     return true;
4284   }
4285 
4286   case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4287   case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4288   case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4289   case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4290 
4291   case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4292   case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4293   case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4294   case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4295     Info.opc = ISD::INTRINSIC_W_CHAIN;
4296     Info.memVT = MVT::f64;
4297     Info.ptrVal = I.getArgOperand(0);
4298     Info.offset = 0;
4299     Info.flags = MachineMemOperand::MOLoad;
4300     Info.align = Align(8);
4301     return true;
4302   }
4303 
4304   case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4305   case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4306   case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4307   case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4308     Info.opc = ISD::INTRINSIC_W_CHAIN;
4309     Info.memVT = MVT::v2f64;
4310     Info.ptrVal = I.getArgOperand(0);
4311     Info.offset = 0;
4312     Info.flags = MachineMemOperand::MOLoad;
4313     Info.align = Align(16);
4314     return true;
4315   }
4316 
4317   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4318   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4319   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4320   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4321   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4322   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4323   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4324   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4325   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4326   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4327   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4328   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4329     Info.opc = ISD::INTRINSIC_VOID;
4330     Info.memVT = MVT::v4f16;
4331     Info.ptrVal = I.getArgOperand(0);
4332     Info.offset = 0;
4333     Info.flags = MachineMemOperand::MOStore;
4334     Info.align = Align(16);
4335     return true;
4336   }
4337 
4338   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4339   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4340   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4341   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4342   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4343   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4344   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4345   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4346   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4347   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4348   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4349   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4350   case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4351   case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4352   case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4353   case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4354     Info.opc = ISD::INTRINSIC_VOID;
4355     Info.memVT = MVT::v8f32;
4356     Info.ptrVal = I.getArgOperand(0);
4357     Info.offset = 0;
4358     Info.flags = MachineMemOperand::MOStore;
4359     Info.align = Align(16);
4360     return true;
4361   }
4362 
4363   case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4364   case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4365   case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4366   case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4367   case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4368   case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4369   case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4370   case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4371   case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4372   case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4373   case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4374   case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4375     Info.opc = ISD::INTRINSIC_VOID;
4376     Info.memVT = MVT::v8i32;
4377     Info.ptrVal = I.getArgOperand(0);
4378     Info.offset = 0;
4379     Info.flags = MachineMemOperand::MOStore;
4380     Info.align = Align(16);
4381     return true;
4382   }
4383 
4384   case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4385   case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4386   case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4387   case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4388   case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4389   case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4390   case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4391   case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
4392     Info.opc = ISD::INTRINSIC_VOID;
4393     Info.memVT = MVT::v2i32;
4394     Info.ptrVal = I.getArgOperand(0);
4395     Info.offset = 0;
4396     Info.flags = MachineMemOperand::MOStore;
4397     Info.align = Align(8);
4398     return true;
4399   }
4400 
4401   case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4402   case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4403   case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4404   case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4405     Info.opc = ISD::INTRINSIC_VOID;
4406     Info.memVT = MVT::v2f64;
4407     Info.ptrVal = I.getArgOperand(0);
4408     Info.offset = 0;
4409     Info.flags = MachineMemOperand::MOStore;
4410     Info.align = Align(16);
4411     return true;
4412   }
4413 
4414   case Intrinsic::nvvm_atomic_load_inc_32:
4415   case Intrinsic::nvvm_atomic_load_dec_32:
4416 
4417   case Intrinsic::nvvm_atomic_add_gen_f_cta:
4418   case Intrinsic::nvvm_atomic_add_gen_f_sys:
4419   case Intrinsic::nvvm_atomic_add_gen_i_cta:
4420   case Intrinsic::nvvm_atomic_add_gen_i_sys:
4421   case Intrinsic::nvvm_atomic_and_gen_i_cta:
4422   case Intrinsic::nvvm_atomic_and_gen_i_sys:
4423   case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4424   case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4425   case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4426   case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4427   case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4428   case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4429   case Intrinsic::nvvm_atomic_max_gen_i_cta:
4430   case Intrinsic::nvvm_atomic_max_gen_i_sys:
4431   case Intrinsic::nvvm_atomic_min_gen_i_cta:
4432   case Intrinsic::nvvm_atomic_min_gen_i_sys:
4433   case Intrinsic::nvvm_atomic_or_gen_i_cta:
4434   case Intrinsic::nvvm_atomic_or_gen_i_sys:
4435   case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4436   case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4437   case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4438   case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
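         // Generic-address-space atomics both read and modify the addressed
         // location, so they are modeled with MOLoad | MOStore; the memory VT
         // is the intrinsic's result type and the alignment is left unknown.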
4439     auto &DL = I.getModule()->getDataLayout();
4440     Info.opc = ISD::INTRINSIC_W_CHAIN;
4441     Info.memVT = getValueType(DL, I.getType());
4442     Info.ptrVal = I.getArgOperand(0);
4443     Info.offset = 0;
4444     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4445     Info.align.reset();
4446     return true;
4447   }
4448 
4449   case Intrinsic::nvvm_ldu_global_i:
4450   case Intrinsic::nvvm_ldu_global_f:
4451   case Intrinsic::nvvm_ldu_global_p: {
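         // ldu/ldg load through the pointer operand; the _p variants load a
         // pointer value, so the memory VT is the target pointer type. The
         // alignment comes from the constant second operand.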
4452     auto &DL = I.getModule()->getDataLayout();
4453     Info.opc = ISD::INTRINSIC_W_CHAIN;
4454     if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
4455       Info.memVT = getValueType(DL, I.getType());
4456     else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
4457       Info.memVT = getPointerTy(DL);
4458     else
4459       Info.memVT = getValueType(DL, I.getType());
4460     Info.ptrVal = I.getArgOperand(0);
4461     Info.offset = 0;
4462     Info.flags = MachineMemOperand::MOLoad;
4463     Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4464 
4465     return true;
4466   }
4467   case Intrinsic::nvvm_ldg_global_i:
4468   case Intrinsic::nvvm_ldg_global_f:
4469   case Intrinsic::nvvm_ldg_global_p: {
4470     auto &DL = I.getModule()->getDataLayout();
4471 
4472     Info.opc = ISD::INTRINSIC_W_CHAIN;
4473     if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
4474       Info.memVT = getValueType(DL, I.getType());
4475     else if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
4476       Info.memVT = getPointerTy(DL);
4477     else
4478       Info.memVT = getValueType(DL, I.getType());
4479     Info.ptrVal = I.getArgOperand(0);
4480     Info.offset = 0;
4481     Info.flags = MachineMemOperand::MOLoad;
4482     Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4483 
4484     return true;
4485   }
4486 
4487   case Intrinsic::nvvm_tex_1d_v4f32_s32:
4488   case Intrinsic::nvvm_tex_1d_v4f32_f32:
4489   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4490   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4491   case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4492   case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4493   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4494   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4495   case Intrinsic::nvvm_tex_2d_v4f32_s32:
4496   case Intrinsic::nvvm_tex_2d_v4f32_f32:
4497   case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4498   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4499   case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4500   case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4501   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4502   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4503   case Intrinsic::nvvm_tex_3d_v4f32_s32:
4504   case Intrinsic::nvvm_tex_3d_v4f32_f32:
4505   case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4506   case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4507   case Intrinsic::nvvm_tex_cube_v4f32_f32:
4508   case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4509   case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4510   case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4511   case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4512   case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4513   case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4514   case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4515   case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4516   case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4517   case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4518   case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4519   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4520   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4521   case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4522   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4523   case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4524   case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4525   case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4526   case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4527   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4528   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4529   case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4530   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4531   case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4532   case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4533   case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4534   case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4535   case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4536   case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4537   case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4538   case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4539   case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4540   case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4541   case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4542   case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
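         // Texture and tld4 gather reads returning four f32 values. There is
         // no LLVM pointer operand to record, so ptrVal is left null.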
4543     Info.opc = getOpcForTextureInstr(Intrinsic);
4544     Info.memVT = MVT::v4f32;
4545     Info.ptrVal = nullptr;
4546     Info.offset = 0;
4547     Info.flags = MachineMemOperand::MOLoad;
4548     Info.align = Align(16);
4549     return true;
4550 
4551   case Intrinsic::nvvm_tex_1d_v4s32_s32:
4552   case Intrinsic::nvvm_tex_1d_v4s32_f32:
4553   case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4554   case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4555   case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4556   case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4557   case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4558   case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4559   case Intrinsic::nvvm_tex_2d_v4s32_s32:
4560   case Intrinsic::nvvm_tex_2d_v4s32_f32:
4561   case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4562   case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4563   case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4564   case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4565   case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4566   case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4567   case Intrinsic::nvvm_tex_3d_v4s32_s32:
4568   case Intrinsic::nvvm_tex_3d_v4s32_f32:
4569   case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4570   case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4571   case Intrinsic::nvvm_tex_cube_v4s32_f32:
4572   case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4573   case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4574   case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4575   case Intrinsic::nvvm_tex_cube_v4u32_f32:
4576   case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4577   case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4578   case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4579   case Intrinsic::nvvm_tex_1d_v4u32_s32:
4580   case Intrinsic::nvvm_tex_1d_v4u32_f32:
4581   case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4582   case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4583   case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4584   case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4585   case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4586   case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4587   case Intrinsic::nvvm_tex_2d_v4u32_s32:
4588   case Intrinsic::nvvm_tex_2d_v4u32_f32:
4589   case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4590   case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4591   case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4592   case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4593   case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4594   case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4595   case Intrinsic::nvvm_tex_3d_v4u32_s32:
4596   case Intrinsic::nvvm_tex_3d_v4u32_f32:
4597   case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4598   case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4599   case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4600   case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4601   case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4602   case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4603   case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4604   case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4605   case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4606   case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4607   case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4608   case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4609   case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4610   case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4611   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4612   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4613   case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4614   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4615   case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4616   case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4617   case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4618   case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4619   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4620   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4621   case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4622   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4623   case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4624   case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4625   case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4626   case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4627   case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4628   case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4629   case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4630   case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4631   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4632   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4633   case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4634   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4635   case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4636   case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4637   case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4638   case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4639   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4640   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4641   case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4642   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4643   case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4644   case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4645   case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4646   case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4647   case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4648   case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4649   case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4650   case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4651   case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4652   case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4653   case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4654   case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4655   case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4656   case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4657   case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4658   case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4659   case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4660   case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4661   case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4662   case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4663     Info.opc = getOpcForTextureInstr(Intrinsic);
4664     Info.memVT = MVT::v4i32;
4665     Info.ptrVal = nullptr;
4666     Info.offset = 0;
4667     Info.flags = MachineMemOperand::MOLoad;
4668     Info.align = Align(16);
4669     return true;
4670 
4671   case Intrinsic::nvvm_suld_1d_i8_clamp:
4672   case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4673   case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4674   case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4675   case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4676   case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4677   case Intrinsic::nvvm_suld_2d_i8_clamp:
4678   case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4679   case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4680   case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4681   case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4682   case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4683   case Intrinsic::nvvm_suld_3d_i8_clamp:
4684   case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4685   case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4686   case Intrinsic::nvvm_suld_1d_i8_trap:
4687   case Intrinsic::nvvm_suld_1d_v2i8_trap:
4688   case Intrinsic::nvvm_suld_1d_v4i8_trap:
4689   case Intrinsic::nvvm_suld_1d_array_i8_trap:
4690   case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4691   case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4692   case Intrinsic::nvvm_suld_2d_i8_trap:
4693   case Intrinsic::nvvm_suld_2d_v2i8_trap:
4694   case Intrinsic::nvvm_suld_2d_v4i8_trap:
4695   case Intrinsic::nvvm_suld_2d_array_i8_trap:
4696   case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4697   case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4698   case Intrinsic::nvvm_suld_3d_i8_trap:
4699   case Intrinsic::nvvm_suld_3d_v2i8_trap:
4700   case Intrinsic::nvvm_suld_3d_v4i8_trap:
4701   case Intrinsic::nvvm_suld_1d_i8_zero:
4702   case Intrinsic::nvvm_suld_1d_v2i8_zero:
4703   case Intrinsic::nvvm_suld_1d_v4i8_zero:
4704   case Intrinsic::nvvm_suld_1d_array_i8_zero:
4705   case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4706   case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4707   case Intrinsic::nvvm_suld_2d_i8_zero:
4708   case Intrinsic::nvvm_suld_2d_v2i8_zero:
4709   case Intrinsic::nvvm_suld_2d_v4i8_zero:
4710   case Intrinsic::nvvm_suld_2d_array_i8_zero:
4711   case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4712   case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4713   case Intrinsic::nvvm_suld_3d_i8_zero:
4714   case Intrinsic::nvvm_suld_3d_v2i8_zero:
4715   case Intrinsic::nvvm_suld_3d_v4i8_zero:
4716     Info.opc = getOpcForSurfaceInstr(Intrinsic);
4717     Info.memVT = MVT::i8;
4718     Info.ptrVal = nullptr;
4719     Info.offset = 0;
4720     Info.flags = MachineMemOperand::MOLoad;
4721     Info.align = Align(16);
4722     return true;
4723 
4724   case Intrinsic::nvvm_suld_1d_i16_clamp:
4725   case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4726   case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4727   case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4728   case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4729   case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4730   case Intrinsic::nvvm_suld_2d_i16_clamp:
4731   case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4732   case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4733   case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4734   case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4735   case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4736   case Intrinsic::nvvm_suld_3d_i16_clamp:
4737   case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4738   case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4739   case Intrinsic::nvvm_suld_1d_i16_trap:
4740   case Intrinsic::nvvm_suld_1d_v2i16_trap:
4741   case Intrinsic::nvvm_suld_1d_v4i16_trap:
4742   case Intrinsic::nvvm_suld_1d_array_i16_trap:
4743   case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4744   case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4745   case Intrinsic::nvvm_suld_2d_i16_trap:
4746   case Intrinsic::nvvm_suld_2d_v2i16_trap:
4747   case Intrinsic::nvvm_suld_2d_v4i16_trap:
4748   case Intrinsic::nvvm_suld_2d_array_i16_trap:
4749   case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4750   case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4751   case Intrinsic::nvvm_suld_3d_i16_trap:
4752   case Intrinsic::nvvm_suld_3d_v2i16_trap:
4753   case Intrinsic::nvvm_suld_3d_v4i16_trap:
4754   case Intrinsic::nvvm_suld_1d_i16_zero:
4755   case Intrinsic::nvvm_suld_1d_v2i16_zero:
4756   case Intrinsic::nvvm_suld_1d_v4i16_zero:
4757   case Intrinsic::nvvm_suld_1d_array_i16_zero:
4758   case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4759   case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4760   case Intrinsic::nvvm_suld_2d_i16_zero:
4761   case Intrinsic::nvvm_suld_2d_v2i16_zero:
4762   case Intrinsic::nvvm_suld_2d_v4i16_zero:
4763   case Intrinsic::nvvm_suld_2d_array_i16_zero:
4764   case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4765   case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4766   case Intrinsic::nvvm_suld_3d_i16_zero:
4767   case Intrinsic::nvvm_suld_3d_v2i16_zero:
4768   case Intrinsic::nvvm_suld_3d_v4i16_zero:
4769     Info.opc = getOpcForSurfaceInstr(Intrinsic);
4770     Info.memVT = MVT::i16;
4771     Info.ptrVal = nullptr;
4772     Info.offset = 0;
4773     Info.flags = MachineMemOperand::MOLoad;
4774     Info.align = Align(16);
4775     return true;
4776 
4777   case Intrinsic::nvvm_suld_1d_i32_clamp:
4778   case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4779   case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4780   case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4781   case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4782   case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4783   case Intrinsic::nvvm_suld_2d_i32_clamp:
4784   case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4785   case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4786   case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4787   case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4788   case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4789   case Intrinsic::nvvm_suld_3d_i32_clamp:
4790   case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4791   case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4792   case Intrinsic::nvvm_suld_1d_i32_trap:
4793   case Intrinsic::nvvm_suld_1d_v2i32_trap:
4794   case Intrinsic::nvvm_suld_1d_v4i32_trap:
4795   case Intrinsic::nvvm_suld_1d_array_i32_trap:
4796   case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4797   case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4798   case Intrinsic::nvvm_suld_2d_i32_trap:
4799   case Intrinsic::nvvm_suld_2d_v2i32_trap:
4800   case Intrinsic::nvvm_suld_2d_v4i32_trap:
4801   case Intrinsic::nvvm_suld_2d_array_i32_trap:
4802   case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4803   case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4804   case Intrinsic::nvvm_suld_3d_i32_trap:
4805   case Intrinsic::nvvm_suld_3d_v2i32_trap:
4806   case Intrinsic::nvvm_suld_3d_v4i32_trap:
4807   case Intrinsic::nvvm_suld_1d_i32_zero:
4808   case Intrinsic::nvvm_suld_1d_v2i32_zero:
4809   case Intrinsic::nvvm_suld_1d_v4i32_zero:
4810   case Intrinsic::nvvm_suld_1d_array_i32_zero:
4811   case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4812   case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4813   case Intrinsic::nvvm_suld_2d_i32_zero:
4814   case Intrinsic::nvvm_suld_2d_v2i32_zero:
4815   case Intrinsic::nvvm_suld_2d_v4i32_zero:
4816   case Intrinsic::nvvm_suld_2d_array_i32_zero:
4817   case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4818   case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4819   case Intrinsic::nvvm_suld_3d_i32_zero:
4820   case Intrinsic::nvvm_suld_3d_v2i32_zero:
4821   case Intrinsic::nvvm_suld_3d_v4i32_zero:
4822     Info.opc = getOpcForSurfaceInstr(Intrinsic);
4823     Info.memVT = MVT::i32;
4824     Info.ptrVal = nullptr;
4825     Info.offset = 0;
4826     Info.flags = MachineMemOperand::MOLoad;
4827     Info.align = Align(16);
4828     return true;
4829 
4830   case Intrinsic::nvvm_suld_1d_i64_clamp:
4831   case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4832   case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4833   case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4834   case Intrinsic::nvvm_suld_2d_i64_clamp:
4835   case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4836   case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4837   case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4838   case Intrinsic::nvvm_suld_3d_i64_clamp:
4839   case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4840   case Intrinsic::nvvm_suld_1d_i64_trap:
4841   case Intrinsic::nvvm_suld_1d_v2i64_trap:
4842   case Intrinsic::nvvm_suld_1d_array_i64_trap:
4843   case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4844   case Intrinsic::nvvm_suld_2d_i64_trap:
4845   case Intrinsic::nvvm_suld_2d_v2i64_trap:
4846   case Intrinsic::nvvm_suld_2d_array_i64_trap:
4847   case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4848   case Intrinsic::nvvm_suld_3d_i64_trap:
4849   case Intrinsic::nvvm_suld_3d_v2i64_trap:
4850   case Intrinsic::nvvm_suld_1d_i64_zero:
4851   case Intrinsic::nvvm_suld_1d_v2i64_zero:
4852   case Intrinsic::nvvm_suld_1d_array_i64_zero:
4853   case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4854   case Intrinsic::nvvm_suld_2d_i64_zero:
4855   case Intrinsic::nvvm_suld_2d_v2i64_zero:
4856   case Intrinsic::nvvm_suld_2d_array_i64_zero:
4857   case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4858   case Intrinsic::nvvm_suld_3d_i64_zero:
4859   case Intrinsic::nvvm_suld_3d_v2i64_zero:
4860     Info.opc = getOpcForSurfaceInstr(Intrinsic);
4861     Info.memVT = MVT::i64;
4862     Info.ptrVal = nullptr;
4863     Info.offset = 0;
4864     Info.flags = MachineMemOperand::MOLoad;
4865     Info.align = Align(16);
4866     return true;
4867   }
4868   return false;
4869 }
4870 
4871 /// getFunctionParamOptimizedAlign - since function arguments are passed via
4872 /// .param space, we may want to increase their alignment in a way that
4873 /// ensures that we can effectively vectorize their loads & stores. We can
4874 /// increase the alignment only if the function has internal or private
4875 /// linkage, as for other linkage types callers may already rely on the
4876 /// default alignment. To allow 128-bit vectorized loads/stores, this
4877 /// function ensures that the alignment is 16 or greater.
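     ///
     /// For example (illustrative): a byval struct of four floats has an ABI
     /// alignment of 4; raising its .param alignment to 16 allows the four
     /// loads to be combined into a single 128-bit vectorized access.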
4878 Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
4879     const Function *F, Type *ArgTy, const DataLayout &DL) const {
4880   const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value();
4881 
4882   // If a function has linkage different from internal or private, we
4883   // must use default ABI alignment as external users rely on it. Same
4884   // for a function that may be called from a function pointer.
4885   if (!F || !F->hasLocalLinkage() ||
4886       F->hasAddressTaken(/*Users=*/nullptr,
4887                          /*IgnoreCallbackUses=*/false,
4888                          /*IgnoreAssumeLikeCalls=*/true,
4889                          /*IgnoreLLVMUsed=*/true))
4890     return Align(ABITypeAlign);
4891 
4892   assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
4893   return Align(std::max(uint64_t(16), ABITypeAlign));
4894 }
4895 
4896 /// Helper for computing alignment of a device function byval parameter.
4897 Align NVPTXTargetLowering::getFunctionByValParamAlign(
4898     const Function *F, Type *ArgTy, Align InitialAlign,
4899     const DataLayout &DL) const {
4900   Align ArgAlign = InitialAlign;
4901   // Try to increase alignment to enhance vectorization options.
4902   if (F)
4903     ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
4904 
4905   // Old ptxas versions have a bug. When PTX code takes the address of a
4906   // byval parameter with alignment < 4, ptxas generates code to
4907   // spill the argument into memory. Alas, on sm_50+ ptxas generates
4908   // SASS code that fails with a misaligned access. To work around
4909   // the problem, make sure that we align byval parameters by at
4910   // least 4. This bug seems to be fixed starting from
4911   // ptxas > 9.0.
4912   // TODO: remove this after verifying the bug is not reproduced
4913   // on non-deprecated ptxas versions.
4914   if (ForceMinByValParamAlign)
4915     ArgAlign = std::max(ArgAlign, Align(4));
4916 
4917   return ArgAlign;
4918 }
4919 
4920 // Helper for getting a function parameter name. The name is composed from
4921 // its index and the function name. A negative index corresponds to the
4922 // special parameter (an unsized array) used for passing variable arguments.
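// For example, for a function whose symbol is "foo", Idx 1 yields
// "foo_param_1", while any negative Idx yields "foo_vararg".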
4923 std::string NVPTXTargetLowering::getParamName(const Function *F,
4924                                               int Idx) const {
4925   std::string ParamName;
4926   raw_string_ostream ParamStr(ParamName);
4927 
4928   ParamStr << getTargetMachine().getSymbol(F)->getName();
4929   if (Idx < 0)
4930     ParamStr << "_vararg";
4931   else
4932     ParamStr << "_param_" << Idx;
4933 
4934   return ParamName;
4935 }
4936 
4937 /// isLegalAddressingMode - Return true if the addressing mode represented
4938 /// by AM is legal for this target, for a load/store of the specified type.
4939 /// Used to guide target specific optimizations, like loop strength reduction
4940 /// (LoopStrengthReduce.cpp) and memory optimization for address mode
4941 /// (CodeGenPrepare.cpp)
4942 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
4943                                                 const AddrMode &AM, Type *Ty,
4944                                                 unsigned AS, Instruction *I) const {
4945   // AddrMode - This represents an addressing mode of:
4946   //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
4947   //
4948   // The legal address modes are
4949   // - [avar]
4950   // - [areg]
4951   // - [areg+immoff]
4952   // - [immAddr]
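  // For example, [%r1+8] (areg+immoff) is legal, while a scaled or
  // two-register form such as [%r1+%r2] is rejected below, since PTX has no
  // reg+reg or scaled-index addressing mode.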
4953 
4954   if (AM.BaseGV) {
4955     return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
4956   }
4957 
4958   switch (AM.Scale) {
4959   case 0: // "r", "r+i" or "i" is allowed
4960     break;
4961   case 1:
4962     if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
4963       return false;
4964     // Otherwise we have r+i.
4965     break;
4966   default:
4967     // No scale > 1 is allowed
4968     return false;
4969   }
4970   return true;
4971 }
4972 
4973 //===----------------------------------------------------------------------===//
4974 //                         NVPTX Inline Assembly Support
4975 //===----------------------------------------------------------------------===//
4976 
4977 /// getConstraintType - Given a constraint letter, return the type of
4978 /// constraint it is for this target.
4979 NVPTXTargetLowering::ConstraintType
4980 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
4981   if (Constraint.size() == 1) {
4982     switch (Constraint[0]) {
4983     default:
4984       break;
4985     case 'b':
4986     case 'r':
4987     case 'h':
4988     case 'c':
4989     case 'l':
4990     case 'f':
4991     case 'd':
4992     case '0':
4993     case 'N':
4994       return C_RegisterClass;
4995     }
4996   }
4997   return TargetLowering::getConstraintType(Constraint);
4998 }
4999 
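// Map single-letter inline-asm register constraints to the corresponding
// NVPTX register classes ('b' -> i1, 'c'/'h' -> i16, 'r' -> i32,
// 'l'/'N' -> i64, 'f' -> f32, 'd' -> f64).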
5000 std::pair<unsigned, const TargetRegisterClass *>
5001 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
5002                                                   StringRef Constraint,
5003                                                   MVT VT) const {
5004   if (Constraint.size() == 1) {
5005     switch (Constraint[0]) {
5006     case 'b':
5007       return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
5008     case 'c':
5009       return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5010     case 'h':
5011       return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5012     case 'r':
5013       return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
5014     case 'l':
5015     case 'N':
5016       return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
5017     case 'f':
5018       return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
5019     case 'd':
5020       return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
5021     }
5022   }
5023   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5024 }
5025 
5026 //===----------------------------------------------------------------------===//
5027 //                         NVPTX DAG Combining
5028 //===----------------------------------------------------------------------===//
5029 
5030 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
5031                                    CodeGenOptLevel OptLevel) const {
5032   // Always honor command-line argument
5033   if (FMAContractLevelOpt.getNumOccurrences() > 0)
5034     return FMAContractLevelOpt > 0;
5035 
5036   // Do not contract if we're not optimizing the code.
5037   if (OptLevel == CodeGenOptLevel::None)
5038     return false;
5039 
5040   // Honor TargetOptions flags that explicitly say fusion is okay.
5041   if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
5042     return true;
5043 
5044   return allowUnsafeFPMath(MF);
5045 }
5046 
5047 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
5048   // Honor TargetOptions flags that explicitly say unsafe math is okay.
5049   if (MF.getTarget().Options.UnsafeFPMath)
5050     return true;
5051 
5052   // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
5053   const Function &F = MF.getFunction();
5054   return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
5055 }
5056 
5057 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5058 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
5059 /// called with the default operands, and if that fails, with commuted
5060 /// operands.
5061 static SDValue PerformADDCombineWithOperands(
5062     SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI,
5063     const NVPTXSubtarget &Subtarget, CodeGenOptLevel OptLevel) {
5064   SelectionDAG &DAG = DCI.DAG;
5065   // Skip the vector case; these combines only handle scalar values.
5066   EVT VT = N0.getValueType();
5067   if (VT.isVector())
5068     return SDValue();
5069 
5070   // fold (add (mul a, b), c) -> (mad a, b, c)
5071   //
5072   if (N0.getOpcode() == ISD::MUL) {
5073     assert(VT.isInteger());
5074     // For integer:
5075     // Since integer multiply-add costs the same as integer multiply
5076     // but is more costly than integer add, do the fusion only when
5077     // the mul is only used in the add.
5078     if (OptLevel == CodeGenOptLevel::None || VT != MVT::i32 ||
5079         !N0.getNode()->hasOneUse())
5080       return SDValue();
5081 
5082     // Do the folding
5083     return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
5084                        N0.getOperand(0), N0.getOperand(1), N1);
5085   }
5086   else if (N0.getOpcode() == ISD::FMUL) {
5087     if (VT == MVT::f32 || VT == MVT::f64) {
5088       const auto *TLI = static_cast<const NVPTXTargetLowering *>(
5089           &DAG.getTargetLoweringInfo());
5090       if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
5091         return SDValue();
5092 
5093       // For floating point:
5094       // Do the fusion only when the mul has fewer than 5 uses and all
5095       // of them are adds.
5096       // The heuristic is that if a use is not an add, then that use
5097       // cannot be fused into an fma, so the mul is still needed anyway.
5098       // If there are more than 4 uses, even if they are all adds, fusing
5099       // them will increase register pressure.
5100       //
5101       int numUses = 0;
5102       int nonAddCount = 0;
5103       for (const SDNode *User : N0.getNode()->uses()) {
5104         numUses++;
5105         if (User->getOpcode() != ISD::FADD)
5106           ++nonAddCount;
5107       }
5108       if (numUses >= 5)
5109         return SDValue();
5110       if (nonAddCount) {
5111         int orderNo = N->getIROrder();
5112         int orderNo2 = N0.getNode()->getIROrder();
5113         // Simple heuristic here for considering potential register
5114         // pressure: the idea is that the difference in IR order is used
5115         // to measure the distance between def and use; the longer the
5116         // distance, the more likely it is to cause register pressure.
5117         if (orderNo - orderNo2 < 500)
5118           return SDValue();
5119 
5120         // Now, check if at least one of the FMUL's operands is live beyond the node N,
5121         // which guarantees that the FMA will not increase register pressure at node N.
5122         bool opIsLive = false;
5123         const SDNode *left = N0.getOperand(0).getNode();
5124         const SDNode *right = N0.getOperand(1).getNode();
5125 
5126         if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5127           opIsLive = true;
5128 
5129         if (!opIsLive)
5130           for (const SDNode *User : left->uses()) {
5131             int orderNo3 = User->getIROrder();
5132             if (orderNo3 > orderNo) {
5133               opIsLive = true;
5134               break;
5135             }
5136           }
5137 
5138         if (!opIsLive)
5139           for (const SDNode *User : right->uses()) {
5140             int orderNo3 = User->getIROrder();
5141             if (orderNo3 > orderNo) {
5142               opIsLive = true;
5143               break;
5144             }
5145           }
5146 
5147         if (!opIsLive)
5148           return SDValue();
5149       }
5150 
5151       return DAG.getNode(ISD::FMA, SDLoc(N), VT,
5152                          N0.getOperand(0), N0.getOperand(1), N1);
5153     }
5154   }
5155 
5156   return SDValue();
5157 }
5158 
5159 static SDValue PerformStoreRetvalCombine(SDNode *N) {
5160   // Operands from the 2nd to the last one are the values to be stored
5161   for (std::size_t I = 2, OpsCount = N->ops().size(); I != OpsCount; ++I)
5162     if (!N->getOperand(I).isUndef())
5163       return SDValue();
5164 
5165   // Operand 0 is the previous value in the chain. Cannot return EntryToken
5166   // as the previous value will become unused and eliminated later.
5167   return N->getOperand(0);
5168 }
5169 
5170 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5171 ///
5172 static SDValue PerformADDCombine(SDNode *N,
5173                                  TargetLowering::DAGCombinerInfo &DCI,
5174                                  const NVPTXSubtarget &Subtarget,
5175                                  CodeGenOptLevel OptLevel) {
5176   SDValue N0 = N->getOperand(0);
5177   SDValue N1 = N->getOperand(1);
5178 
5179   // First try with the default operand order.
5180   if (SDValue Result =
5181           PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
5182     return Result;
5183 
5184   // If that didn't work, try again with the operands commuted.
5185   return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
5186 }
5187 
5188 static SDValue PerformANDCombine(SDNode *N,
5189                                  TargetLowering::DAGCombinerInfo &DCI) {
5190   // The type legalizer turns a vector load of i8 values into a zextload to i16
5191   // registers, optionally ANY_EXTENDs it (if the target type is an integer),
5192   // and ANDs off the high 8 bits. Since we turn this load into a
5193   // target-specific DAG node, the DAG combiner fails to eliminate these AND
5194   // nodes. Do that here.
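  // For example, (and (any_extend (LoadV2 ... i8)), 255) can be rewritten as a
  // zero-extended load, since the zextload already leaves the high bits clear.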
5195   SDValue Val = N->getOperand(0);
5196   SDValue Mask = N->getOperand(1);
5197 
5198   if (isa<ConstantSDNode>(Val)) {
5199     std::swap(Val, Mask);
5200   }
5201 
5202   SDValue AExt;
5203 
5204   // Convert BFE -> truncate i16 -> and 255
5205   // to just BFE -> truncate i16, as the value already has all the bits in the
5206   // right places.
5207   if (Val.getOpcode() == ISD::TRUNCATE) {
5208     SDValue BFE = Val.getOperand(0);
5209     if (BFE.getOpcode() != NVPTXISD::BFE)
5210       return SDValue();
5211 
5212     ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
5213     if (!BFEBits)
5214       return SDValue();
5215     uint64_t BFEBitsVal = BFEBits->getZExtValue();
5216 
5217     ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
5218     if (!MaskCnst) {
5219       // Not an AND with a constant
5220       return SDValue();
5221     }
5222     uint64_t MaskVal = MaskCnst->getZExtValue();
5223 
5224     if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
5225       return SDValue();
5226     // If we get here, the AND is unnecessary.  Just replace it with the trunc
5227     DCI.CombineTo(N, Val, false);
5228   }
5229   // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
5230   if (Val.getOpcode() == ISD::ANY_EXTEND) {
5231     AExt = Val;
5232     Val = Val->getOperand(0);
5233   }
5234 
5235   if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
5236     Val = Val->getOperand(0);
5237   }
5238 
5239   if (Val->getOpcode() == NVPTXISD::LoadV2 ||
5240       Val->getOpcode() == NVPTXISD::LoadV4) {
5241     ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
5242     if (!MaskCnst) {
5243       // Not an AND with a constant
5244       return SDValue();
5245     }
5246 
5247     uint64_t MaskVal = MaskCnst->getZExtValue();
5248     if (MaskVal != 0xff) {
5249       // Not an AND that chops off top 8 bits
5250       return SDValue();
5251     }
5252 
5253     MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
5254     if (!Mem) {
5255       // Not a MemSDNode?!?
5256       return SDValue();
5257     }
5258 
5259     EVT MemVT = Mem->getMemoryVT();
5260     if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
5261       // We only handle the i8 case
5262       return SDValue();
5263     }
5264 
5265     unsigned ExtType =
5266       cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
5267         getZExtValue();
5268     if (ExtType == ISD::SEXTLOAD) {
5269       // If for some reason the load is a sextload, the and is needed to zero
5270       // out the high 8 bits
5271       return SDValue();
5272     }
5273 
5274     bool AddTo = false;
5275     if (AExt.getNode() != nullptr) {
5276       // Re-insert the ext as a zext.
5277       Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
5278                             AExt.getValueType(), Val);
5279       AddTo = true;
5280     }
5281 
5282     // If we get here, the AND is unnecessary.  Just replace it with the load
5283     DCI.CombineTo(N, Val, AddTo);
5284   }
5285 
5286   return SDValue();
5287 }
5288 
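/// PerformREMCombine - When the matching division of the same operands is
/// already present in the DAG, rewrite rem as Num - (Num / Den) * Den so the
/// division is computed only once.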
5289 static SDValue PerformREMCombine(SDNode *N,
5290                                  TargetLowering::DAGCombinerInfo &DCI,
5291                                  CodeGenOptLevel OptLevel) {
5292   assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
5293 
5294   // Don't do anything at less than -O2.
5295   if (OptLevel < CodeGenOptLevel::Default)
5296     return SDValue();
5297 
5298   SelectionDAG &DAG = DCI.DAG;
5299   SDLoc DL(N);
5300   EVT VT = N->getValueType(0);
5301   bool IsSigned = N->getOpcode() == ISD::SREM;
5302   unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
5303 
5304   const SDValue &Num = N->getOperand(0);
5305   const SDValue &Den = N->getOperand(1);
5306 
5307   for (const SDNode *U : Num->uses()) {
5308     if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
5309         U->getOperand(1) == Den) {
5310       // Num % Den -> Num - (Num / Den) * Den
5311       return DAG.getNode(ISD::SUB, DL, VT, Num,
5312                          DAG.getNode(ISD::MUL, DL, VT,
5313                                      DAG.getNode(DivOpc, DL, VT, Num, Den),
5314                                      Den));
5315     }
5316   }
5317   return SDValue();
5318 }
5319 
5320 enum OperandSignedness {
5321   Signed = 0,
5322   Unsigned,
5323   Unknown
5324 };
5325 
5326 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5327 /// that can be demoted to \p OptSize bits without loss of information. The
5328 /// signedness of the operand, if determinable, is placed in \p S.
5329 static bool IsMulWideOperandDemotable(SDValue Op,
5330                                       unsigned OptSize,
5331                                       OperandSignedness &S) {
5332   S = Unknown;
5333 
5334   if (Op.getOpcode() == ISD::SIGN_EXTEND ||
5335       Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
5336     EVT OrigVT = Op.getOperand(0).getValueType();
5337     if (OrigVT.getFixedSizeInBits() <= OptSize) {
5338       S = Signed;
5339       return true;
5340     }
5341   } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
5342     EVT OrigVT = Op.getOperand(0).getValueType();
5343     if (OrigVT.getFixedSizeInBits() <= OptSize) {
5344       S = Unsigned;
5345       return true;
5346     }
5347   }
5348 
5349   return false;
5350 }
5351 
5352 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
5353 /// be demoted to \p OptSize bits without loss of information. If the operands
5354 /// contain a constant, it should appear as the RHS operand. The signedness of
5355 /// the operands is placed in \p IsSigned.
5356 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
5357                                         unsigned OptSize,
5358                                         bool &IsSigned) {
5359   OperandSignedness LHSSign;
5360 
5361   // The LHS operand must be a demotable op
5362   if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
5363     return false;
5364 
5365   // We should have been able to determine the signedness from the LHS
5366   if (LHSSign == Unknown)
5367     return false;
5368 
5369   IsSigned = (LHSSign == Signed);
5370 
5371   // The RHS can be a demotable op or a constant
5372   if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
5373     const APInt &Val = CI->getAPIntValue();
5374     if (LHSSign == Unsigned) {
5375       return Val.isIntN(OptSize);
5376     } else {
5377       return Val.isSignedIntN(OptSize);
5378     }
5379   } else {
5380     OperandSignedness RHSSign;
5381     if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
5382       return false;
5383 
5384     return LHSSign == RHSSign;
5385   }
5386 }
5387 
5388 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
5389 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
5390 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
5391 /// amount.
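/// For example, an i32 multiply whose operands are both sign-extended from
/// 16 bits or narrower becomes MUL_WIDE_SIGNED on the truncated i16 operands
/// (mul.wide.s16), and an i32 shl by a constant is treated as a multiply by
/// the corresponding power of two.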
5392 static SDValue TryMULWIDECombine(SDNode *N,
5393                                  TargetLowering::DAGCombinerInfo &DCI) {
5394   EVT MulType = N->getValueType(0);
5395   if (MulType != MVT::i32 && MulType != MVT::i64) {
5396     return SDValue();
5397   }
5398 
5399   SDLoc DL(N);
5400   unsigned OptSize = MulType.getSizeInBits() >> 1;
5401   SDValue LHS = N->getOperand(0);
5402   SDValue RHS = N->getOperand(1);
5403 
5404   // Canonicalize the multiply so the constant (if any) is on the right
5405   if (N->getOpcode() == ISD::MUL) {
5406     if (isa<ConstantSDNode>(LHS)) {
5407       std::swap(LHS, RHS);
5408     }
5409   }
5410 
5411   // If we have a SHL, determine the actual multiply amount
5412   if (N->getOpcode() == ISD::SHL) {
5413     ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
5414     if (!ShlRHS) {
5415       return SDValue();
5416     }
5417 
5418     APInt ShiftAmt = ShlRHS->getAPIntValue();
5419     unsigned BitWidth = MulType.getSizeInBits();
5420     if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
5421       APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
5422       RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
5423     } else {
5424       return SDValue();
5425     }
5426   }
5427 
5428   bool Signed;
5429   // Verify that our operands are demotable
5430   if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
5431     return SDValue();
5432   }
5433 
5434   EVT DemotedVT;
5435   if (MulType == MVT::i32) {
5436     DemotedVT = MVT::i16;
5437   } else {
5438     DemotedVT = MVT::i32;
5439   }
5440 
5441   // Truncate the operands to the correct size. Note that these are just for
5442   // type consistency and will (likely) be eliminated in later phases.
5443   SDValue TruncLHS =
5444     DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
5445   SDValue TruncRHS =
5446     DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
5447 
5448   unsigned Opc;
5449   if (Signed) {
5450     Opc = NVPTXISD::MUL_WIDE_SIGNED;
5451   } else {
5452     Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
5453   }
5454 
5455   return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
5456 }
5457 
5458 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
5459 static SDValue PerformMULCombine(SDNode *N,
5460                                  TargetLowering::DAGCombinerInfo &DCI,
5461                                  CodeGenOptLevel OptLevel) {
5462   if (OptLevel > CodeGenOptLevel::None) {
5463     // Try mul.wide combining at OptLevel > 0
5464     if (SDValue Ret = TryMULWIDECombine(N, DCI))
5465       return Ret;
5466   }
5467 
5468   return SDValue();
5469 }
5470 
5471 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
5472 static SDValue PerformSHLCombine(SDNode *N,
5473                                  TargetLowering::DAGCombinerInfo &DCI,
5474                                  CodeGenOptLevel OptLevel) {
5475   if (OptLevel > CodeGenOptLevel::None) {
5476     // Try mul.wide combining at OptLevel > 0
5477     if (SDValue Ret = TryMULWIDECombine(N, DCI))
5478       return Ret;
5479   }
5480 
5481   return SDValue();
5482 }
5483 
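/// PerformSETCCCombine - Lower a v2i1 setcc on v2f16/v2bf16 operands to a
/// single SETP_F16X2/SETP_BF16X2 node that yields two scalar predicates.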
5484 static SDValue PerformSETCCCombine(SDNode *N,
5485                                    TargetLowering::DAGCombinerInfo &DCI,
5486                                    unsigned int SmVersion) {
5487   EVT CCType = N->getValueType(0);
5488   SDValue A = N->getOperand(0);
5489   SDValue B = N->getOperand(1);
5490 
5491   EVT AType = A.getValueType();
5492   if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
5493     return SDValue();
5494 
5495   if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
5496     return SDValue();
5497 
5498   SDLoc DL(N);
5499   // setp.f16x2 returns two scalar predicates, which we need to
5500   // convert back to v2i1. The returned result will be scalarized by
5501   // the legalizer, but the comparison will remain a single vector
5502   // instruction.
5503   SDValue CCNode = DCI.DAG.getNode(
5504       A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
5505                                      : NVPTXISD::SETP_BF16X2,
5506       DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
5507   return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
5508                          CCNode.getValue(1));
5509 }
5510 
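/// PerformEXTRACTCombine - Replace extract_vector_elt with a constant,
/// non-zero index on small (<= 64-bit) vectors with a bitcast to an integer
/// of the same width followed by a shift and truncate, avoiding a real
/// vector extraction.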
5511 static SDValue PerformEXTRACTCombine(SDNode *N,
5512                                      TargetLowering::DAGCombinerInfo &DCI) {
5513   SDValue Vector = N->getOperand(0);
5514   SDLoc DL(N);
5515   EVT VectorVT = Vector.getValueType();
5516   if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
5517       IsPTXVectorType(VectorVT.getSimpleVT()))
5518     return SDValue(); // Native vector loads already combine nicely w/
5519                       // extract_vector_elt, except for v4i8.
5520   // Don't mess with singletons or v2*16 types; we already handle them OK.
5521   if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
5522       VectorVT == MVT::v4i8)
5523     return SDValue();
5524 
5525   uint64_t VectorBits = VectorVT.getSizeInBits();
5526   // We only handle the types we can extract in-register.
5527   if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
5528     return SDValue();
5529 
5530   ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
5531   // Index == 0 is handled by generic DAG combiner.
5532   if (!Index || Index->getZExtValue() == 0)
5533     return SDValue();
5534 
5535   MVT IVT = MVT::getIntegerVT(VectorBits);
5536   EVT EltVT = VectorVT.getVectorElementType();
5537   EVT EltIVT = EltVT.changeTypeToInteger();
5538   uint64_t EltBits = EltVT.getScalarSizeInBits();
5539 
5540   SDValue Result = DCI.DAG.getNode(
5541       ISD::TRUNCATE, DL, EltIVT,
5542       DCI.DAG.getNode(
5543           ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
5544           DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
5545 
5546   // If element has non-integer type, bitcast it back to the expected type.
5547   if (EltVT != EltIVT)
5548     Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
5549   // Past legalizer, we may need to extend i8 -> i16 to match the register type.
5550   if (EltVT != N->getValueType(0))
5551     Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
5552 
5553   return Result;
5554 }
5555 
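/// PerformVSELECTCombine - Expand a v4i8 vselect into four scalar i32
/// selects and rebuild the result vector.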
5556 static SDValue PerformVSELECTCombine(SDNode *N,
5557                                      TargetLowering::DAGCombinerInfo &DCI) {
5558   SDValue VA = N->getOperand(1);
5559   EVT VectorVT = VA.getValueType();
5560   if (VectorVT != MVT::v4i8)
5561     return SDValue();
5562 
5563   // We need to split the vselect into individual per-element operations.
5564   // Because we use BFE/BFI instructions for byte extraction/insertion, we
5565   // end up with 32-bit values anyway, so we may as well do the comparison as
5566   // i32 to avoid the conversions to/from i16 normally used for i8 values.
5567   SmallVector<SDValue, 4> E;
5568   SDLoc DL(N);
5569   SDValue VCond = N->getOperand(0);
5570   SDValue VB = N->getOperand(2);
5571   for (int I = 0; I < 4; ++I) {
5572     SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
5573                                 DCI.DAG.getConstant(I, DL, MVT::i32));
5574     SDValue EA = DCI.DAG.getAnyExtOrTrunc(
5575         DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
5576                         DCI.DAG.getConstant(I, DL, MVT::i32)),
5577         DL, MVT::i32);
5578     SDValue EB = DCI.DAG.getAnyExtOrTrunc(
5579         DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
5580                         DCI.DAG.getConstant(I, DL, MVT::i32)),
5581         DL, MVT::i32);
5582     E.push_back(DCI.DAG.getAnyExtOrTrunc(
5583         DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
5584   }
5585   return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
5586 }
5587 
5588 static SDValue PerformLOADCombine(SDNode *N,
5589                                   TargetLowering::DAGCombinerInfo &DCI) {
5590   SelectionDAG &DAG = DCI.DAG;
5591   LoadSDNode *LD = cast<LoadSDNode>(N);
5592 
5593   // Lower a v16i8 load into a LoadV4 operation with i32 results instead of
5594   // letting ReplaceLoadVector split it into smaller loads during legalization.
5595   // This is done at dag-combine1 time, so that vector operations with i8
5596   // elements can be optimized away instead of being needlessly split during
5597   // legalization, which involves storing to the stack and loading it back.
5598   EVT VT = N->getValueType(0);
5599   if (VT != MVT::v16i8)
5600     return SDValue();
5601 
5602   SDLoc DL(N);
5603 
5604   // Create a v4i32 vector load operation, effectively <4 x v4i8>.
5605   unsigned Opc = NVPTXISD::LoadV4;
5606   EVT NewVT = MVT::v4i32;
5607   EVT EltVT = NewVT.getVectorElementType();
5608   unsigned NumElts = NewVT.getVectorNumElements();
5609   EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
5610   SDVTList RetVTList = DAG.getVTList(RetVTs);
5611   SmallVector<SDValue, 8> Ops(N->ops());
5612   Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
5613   SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT,
5614                                             LD->getMemOperand());
5615   SDValue NewChain = NewLoad.getValue(NumElts);
5616 
5617   // Create a vector of the same type returned by the original load.
5618   SmallVector<SDValue, 4> Elts;
5619   for (unsigned i = 0; i < NumElts; i++)
5620     Elts.push_back(NewLoad.getValue(i));
5621   return DCI.DAG.getMergeValues(
5622       {DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
5623        NewChain},
5624       DL);
5625 }
5626 
5627 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
5628                                                DAGCombinerInfo &DCI) const {
5629   CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
5630   switch (N->getOpcode()) {
5631     default: break;
5632     case ISD::ADD:
5633     case ISD::FADD:
5634       return PerformADDCombine(N, DCI, STI, OptLevel);
5635     case ISD::MUL:
5636       return PerformMULCombine(N, DCI, OptLevel);
5637     case ISD::SHL:
5638       return PerformSHLCombine(N, DCI, OptLevel);
5639     case ISD::AND:
5640       return PerformANDCombine(N, DCI);
5641     case ISD::UREM:
5642     case ISD::SREM:
5643       return PerformREMCombine(N, DCI, OptLevel);
5644     case ISD::SETCC:
5645       return PerformSETCCCombine(N, DCI, STI.getSmVersion());
5646     case ISD::LOAD:
5647       return PerformLOADCombine(N, DCI);
5648     case NVPTXISD::StoreRetval:
5649     case NVPTXISD::StoreRetvalV2:
5650     case NVPTXISD::StoreRetvalV4:
5651       return PerformStoreRetvalCombine(N);
5652     case ISD::EXTRACT_VECTOR_ELT:
5653       return PerformEXTRACTCombine(N, DCI);
5654     case ISD::VSELECT:
5655       return PerformVSELECTCombine(N, DCI);
5656   }
5657   return SDValue();
5658 }
5659 
5660 /// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
5661 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
5662                               SmallVectorImpl<SDValue> &Results) {
5663   EVT ResVT = N->getValueType(0);
5664   SDLoc DL(N);
5665 
5666   assert(ResVT.isVector() && "Vector load must have vector type");
5667 
5668   // We only handle "native" vector sizes for now, e.g. <4 x double> is not
5669   // legal.  We can (and should) split that into 2 loads of <2 x double> here
5670   // but I'm leaving that as a TODO for now.
5671   assert(ResVT.isSimple() && "Can only handle simple types");
5672   switch (ResVT.getSimpleVT().SimpleTy) {
5673   default:
5674     return;
5675   case MVT::v2i8:
5676   case MVT::v2i16:
5677   case MVT::v2i32:
5678   case MVT::v2i64:
5679   case MVT::v2f16:
5680   case MVT::v2f32:
5681   case MVT::v2f64:
5682   case MVT::v4i8:
5683   case MVT::v4i16:
5684   case MVT::v4i32:
5685   case MVT::v4f16:
5686   case MVT::v4f32:
5687   case MVT::v8f16:  // <4 x f16x2>
5688   case MVT::v8bf16: // <4 x bf16x2>
5689   case MVT::v8i16:  // <4 x i16x2>
5690     // This is a "native" vector type
5691     break;
5692   }
5693 
5694   LoadSDNode *LD = cast<LoadSDNode>(N);
5695 
5696   Align Alignment = LD->getAlign();
5697   auto &TD = DAG.getDataLayout();
5698   Align PrefAlign =
5699       TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
5700   if (Alignment < PrefAlign) {
5701     // This load is not sufficiently aligned, so bail out and let this vector
5702     // load be scalarized.  Note that we may still be able to emit smaller
5703     // vector loads.  For example, if we are loading a <4 x float> with an
5704     // alignment of 8, this check will fail but the legalizer will try again
5705     // with 2 x <2 x float>, which will succeed with an alignment of 8.
5706     return;
5707   }
5708 
5709   EVT EltVT = ResVT.getVectorElementType();
5710   unsigned NumElts = ResVT.getVectorNumElements();
5711 
5712   // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
5713   // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
5714   // loaded type to i16 and propagate the "real" type as the memory type.
5715   bool NeedTrunc = false;
5716   if (EltVT.getSizeInBits() < 16) {
5717     EltVT = MVT::i16;
5718     NeedTrunc = true;
5719   }
5720 
5721   unsigned Opcode = 0;
5722   SDVTList LdResVTs;
5723   bool Load16x2 = false;
5724 
5725   switch (NumElts) {
5726   default:
5727     return;
5728   case 2:
5729     Opcode = NVPTXISD::LoadV2;
5730     LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
5731     break;
5732   case 4: {
5733     Opcode = NVPTXISD::LoadV4;
5734     EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
5735     LdResVTs = DAG.getVTList(ListVTs);
5736     break;
5737   }
5738   case 8: {
5739     // v8 vectors of 16-bit elements are a special case. PTX doesn't have an
5740     // ld.v8 instruction for them. Instead, we split the vector into v2x16
5741     // chunks and load them with ld.v4.b32.
5742     assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
5743     Load16x2 = true;
5744     Opcode = NVPTXISD::LoadV4;
5745     EVT VVT;
5746     switch (EltVT.getSimpleVT().SimpleTy) {
5747     case MVT::f16:
5748       VVT = MVT::v2f16;
5749       break;
5750     case MVT::bf16:
5751       VVT = MVT::v2bf16;
5752       break;
5753     case MVT::i16:
5754       VVT = MVT::v2i16;
5755       break;
5756     default:
5757       llvm_unreachable("Unsupported v8 vector type.");
5758     }
5759     EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
5760     LdResVTs = DAG.getVTList(ListVTs);
5761     break;
5762   }
5763   }
5764 
5765   // Copy regular operands
5766   SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
5767 
5768   // The select routine does not have access to the LoadSDNode instance, so
5769   // pass along the extension information
5770   OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
5771 
5772   SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
5773                                           LD->getMemoryVT(),
5774                                           LD->getMemOperand());
5775 
5776   SmallVector<SDValue, 8> ScalarRes;
5777   if (Load16x2) {
5778     // Split the v2x16 subvectors back into individual elements.
5779     NumElts /= 2;
5780     for (unsigned i = 0; i < NumElts; ++i) {
5781       SDValue SubVector = NewLD.getValue(i);
5782       SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
5783                                DAG.getIntPtrConstant(0, DL));
5784       SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
5785                                DAG.getIntPtrConstant(1, DL));
5786       ScalarRes.push_back(E0);
5787       ScalarRes.push_back(E1);
5788     }
5789   } else {
5790     for (unsigned i = 0; i < NumElts; ++i) {
5791       SDValue Res = NewLD.getValue(i);
5792       if (NeedTrunc)
5793         Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
5794       ScalarRes.push_back(Res);
5795     }
5796   }
5797 
5798   SDValue LoadChain = NewLD.getValue(NumElts);
5799 
5800   SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
5801 
5802   Results.push_back(BuildVec);
5803   Results.push_back(LoadChain);
5804 }
5805 
5806 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
5807                                      SmallVectorImpl<SDValue> &Results) {
5808   SDValue Chain = N->getOperand(0);
5809   SDValue Intrin = N->getOperand(1);
5810   SDLoc DL(N);
5811 
5812   // Get the intrinsic ID
5813   unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
5814   switch (IntrinNo) {
5815   default:
5816     return;
5817   case Intrinsic::nvvm_ldg_global_i:
5818   case Intrinsic::nvvm_ldg_global_f:
5819   case Intrinsic::nvvm_ldg_global_p:
5820   case Intrinsic::nvvm_ldu_global_i:
5821   case Intrinsic::nvvm_ldu_global_f:
5822   case Intrinsic::nvvm_ldu_global_p: {
5823     EVT ResVT = N->getValueType(0);
5824 
5825     if (ResVT.isVector()) {
5826       // Vector LDG/LDU
5827 
5828       unsigned NumElts = ResVT.getVectorNumElements();
5829       EVT EltVT = ResVT.getVectorElementType();
5830 
5831       // Since LDU/LDG are target nodes, we cannot rely on DAG type
5832       // legalization.
5833       // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
5834       // loaded type to i16 and propagate the "real" type as the memory type.
5835       bool NeedTrunc = false;
5836       if (EltVT.getSizeInBits() < 16) {
5837         EltVT = MVT::i16;
5838         NeedTrunc = true;
5839       }
5840 
5841       unsigned Opcode = 0;
5842       SDVTList LdResVTs;
5843 
5844       switch (NumElts) {
5845       default:
5846         return;
5847       case 2:
5848         switch (IntrinNo) {
5849         default:
5850           return;
5851         case Intrinsic::nvvm_ldg_global_i:
5852         case Intrinsic::nvvm_ldg_global_f:
5853         case Intrinsic::nvvm_ldg_global_p:
5854           Opcode = NVPTXISD::LDGV2;
5855           break;
5856         case Intrinsic::nvvm_ldu_global_i:
5857         case Intrinsic::nvvm_ldu_global_f:
5858         case Intrinsic::nvvm_ldu_global_p:
5859           Opcode = NVPTXISD::LDUV2;
5860           break;
5861         }
5862         LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
5863         break;
5864       case 4: {
5865         switch (IntrinNo) {
5866         default:
5867           return;
5868         case Intrinsic::nvvm_ldg_global_i:
5869         case Intrinsic::nvvm_ldg_global_f:
5870         case Intrinsic::nvvm_ldg_global_p:
5871           Opcode = NVPTXISD::LDGV4;
5872           break;
5873         case Intrinsic::nvvm_ldu_global_i:
5874         case Intrinsic::nvvm_ldu_global_f:
5875         case Intrinsic::nvvm_ldu_global_p:
5876           Opcode = NVPTXISD::LDUV4;
5877           break;
5878         }
5879         EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
5880         LdResVTs = DAG.getVTList(ListVTs);
5881         break;
5882       }
5883       }
5884 
5885       SmallVector<SDValue, 8> OtherOps;
5886 
5887       // Copy regular operands
5888 
5889       OtherOps.push_back(Chain); // Chain
5890                                  // Skip operand 1 (intrinsic ID)
5891       // Others
5892       OtherOps.append(N->op_begin() + 2, N->op_end());
5893 
5894       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
5895 
5896       SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
5897                                               MemSD->getMemoryVT(),
5898                                               MemSD->getMemOperand());
5899 
5900       SmallVector<SDValue, 4> ScalarRes;
5901 
5902       for (unsigned i = 0; i < NumElts; ++i) {
5903         SDValue Res = NewLD.getValue(i);
5904         if (NeedTrunc)
5905           Res =
5906               DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
5907         ScalarRes.push_back(Res);
5908       }
5909 
5910       SDValue LoadChain = NewLD.getValue(NumElts);
5911 
5912       SDValue BuildVec =
5913           DAG.getBuildVector(ResVT, DL, ScalarRes);
5914 
5915       Results.push_back(BuildVec);
5916       Results.push_back(LoadChain);
5917     } else {
5918       // i8 LDG/LDU
5919       assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
5920              "Custom handling of non-i8 ldu/ldg?");
5921 
5922       // Just copy all operands as-is
5923       SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
5924 
5925       // Force output to i16
5926       SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
5927 
5928       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
5929 
5930       // We make sure the memory type is i8, which will be used during isel
5931       // to select the proper instruction.
5932       SDValue NewLD =
5933           DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
5934                                   MVT::i8, MemSD->getMemOperand());
5935 
5936       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
5937                                     NewLD.getValue(0)));
5938       Results.push_back(NewLD.getValue(1));
5939     }
5940   }
5941   }
5942 }
5943 
5944 void NVPTXTargetLowering::ReplaceNodeResults(
5945     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
5946   switch (N->getOpcode()) {
5947   default:
5948     report_fatal_error("Unhandled custom legalization");
5949   case ISD::LOAD:
5950     ReplaceLoadVector(N, DAG, Results);
5951     return;
5952   case ISD::INTRINSIC_W_CHAIN:
5953     ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
5954     return;
5955   }
5956 }
5957 
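// Decide how an atomicrmw is lowered: operations the target supports natively
// (e.g. f32 fadd, 32-bit integer add/and/or/xor/xchg/min/max, and their
// 64-bit forms when the subtarget has them) are left alone; everything else
// is expanded to a cmpxchg loop.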
5958 NVPTXTargetLowering::AtomicExpansionKind
5959 NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
5960   Type *Ty = AI->getValOperand()->getType();
5961 
5962   if (AI->isFloatingPointOperation()) {
5963     if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
5964       if (Ty->isFloatTy())
5965         return AtomicExpansionKind::None;
5966       if (Ty->isDoubleTy() && STI.hasAtomAddF64())
5967         return AtomicExpansionKind::None;
5968     }
5969     return AtomicExpansionKind::CmpXChg;
5970   }
5971 
5972   assert(Ty->isIntegerTy() && "Ty should be integer at this point");
5973   auto ITy = cast<llvm::IntegerType>(Ty);
5974 
5975   switch (AI->getOperation()) {
5976   default:
5977     return AtomicExpansionKind::CmpXChg;
5978   case AtomicRMWInst::BinOp::And:
5979   case AtomicRMWInst::BinOp::Or:
5980   case AtomicRMWInst::BinOp::Xor:
5981   case AtomicRMWInst::BinOp::Xchg:
5982     switch (ITy->getBitWidth()) {
5983     case 8:
5984     case 16:
5985       return AtomicExpansionKind::CmpXChg;
5986     case 32:
5987       return AtomicExpansionKind::None;
5988     case 64:
5989       if (STI.hasAtomBitwise64())
5990         return AtomicExpansionKind::None;
5991       return AtomicExpansionKind::CmpXChg;
5992     default:
5993       llvm_unreachable("unsupported width encountered");
5994     }
5995   case AtomicRMWInst::BinOp::Add:
5996   case AtomicRMWInst::BinOp::Sub:
5997   case AtomicRMWInst::BinOp::Max:
5998   case AtomicRMWInst::BinOp::Min:
5999   case AtomicRMWInst::BinOp::UMax:
6000   case AtomicRMWInst::BinOp::UMin:
6001     switch (ITy->getBitWidth()) {
6002     case 8:
6003     case 16:
6004       return AtomicExpansionKind::CmpXChg;
6005     case 32:
6006       return AtomicExpansionKind::None;
6007     case 64:
6008       if (STI.hasAtomMinMax64())
6009         return AtomicExpansionKind::None;
6010       return AtomicExpansionKind::CmpXChg;
6011     default:
6012       llvm_unreachable("unsupported width encountered");
6013     }
6014   }
6015 
6016   return AtomicExpansionKind::CmpXChg;
6017 }
6018 
6019 // Pin NVPTXTargetObjectFile's vtables to this file.
6020 NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;
6021 
6022 MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
6023     const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
6024   return getDataSection();
6025 }
6026