xref: /freebsd/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (revision 59144db3fca192c4637637dfe6b5a5d98632cd47)
1 //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the interfaces that NVPTX uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "NVPTXISelLowering.h"
15 #include "MCTargetDesc/NVPTXBaseInfo.h"
16 #include "NVPTX.h"
17 #include "NVPTXSubtarget.h"
18 #include "NVPTXTargetMachine.h"
19 #include "NVPTXTargetObjectFile.h"
20 #include "NVPTXUtilities.h"
21 #include "llvm/ADT/APInt.h"
22 #include "llvm/ADT/STLExtras.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/ADT/StringRef.h"
25 #include "llvm/CodeGen/Analysis.h"
26 #include "llvm/CodeGen/ISDOpcodes.h"
27 #include "llvm/CodeGen/MachineFunction.h"
28 #include "llvm/CodeGen/MachineMemOperand.h"
29 #include "llvm/CodeGen/MachineValueType.h"
30 #include "llvm/CodeGen/SelectionDAG.h"
31 #include "llvm/CodeGen/SelectionDAGNodes.h"
32 #include "llvm/CodeGen/TargetCallingConv.h"
33 #include "llvm/CodeGen/TargetLowering.h"
34 #include "llvm/CodeGen/ValueTypes.h"
35 #include "llvm/IR/Argument.h"
36 #include "llvm/IR/Attributes.h"
37 #include "llvm/IR/Constants.h"
38 #include "llvm/IR/DataLayout.h"
39 #include "llvm/IR/DerivedTypes.h"
40 #include "llvm/IR/DiagnosticInfo.h"
41 #include "llvm/IR/FPEnv.h"
42 #include "llvm/IR/Function.h"
43 #include "llvm/IR/GlobalValue.h"
44 #include "llvm/IR/Instruction.h"
45 #include "llvm/IR/Instructions.h"
46 #include "llvm/IR/IntrinsicsNVPTX.h"
47 #include "llvm/IR/Module.h"
48 #include "llvm/IR/Type.h"
49 #include "llvm/IR/Value.h"
50 #include "llvm/Support/Casting.h"
51 #include "llvm/Support/CodeGen.h"
52 #include "llvm/Support/CommandLine.h"
53 #include "llvm/Support/ErrorHandling.h"
54 #include "llvm/Support/raw_ostream.h"
55 #include "llvm/Target/TargetMachine.h"
56 #include "llvm/Target/TargetOptions.h"
57 #include <algorithm>
58 #include <cassert>
59 #include <cmath>
60 #include <cstdint>
61 #include <iterator>
62 #include <sstream>
63 #include <string>
64 #include <utility>
65 #include <vector>
66 
67 #define DEBUG_TYPE "nvptx-lower"
68 
69 using namespace llvm;
70 
71 static std::atomic<unsigned> GlobalUniqueCallSite;
72 
73 static cl::opt<bool> sched4reg(
74     "nvptx-sched4reg",
75     cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
76 
77 static cl::opt<unsigned> FMAContractLevelOpt(
78     "nvptx-fma-level", cl::Hidden,
79     cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
80              " 1: do it  2: do it aggressively"),
81     cl::init(2));
82 
83 static cl::opt<int> UsePrecDivF32(
84     "nvptx-prec-divf32", cl::Hidden,
85     cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
86              " IEEE Compliant F32 div.rnd if available."),
87     cl::init(2));
88 
89 static cl::opt<bool> UsePrecSqrtF32(
90     "nvptx-prec-sqrtf32", cl::Hidden,
91     cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
92     cl::init(true));
93 
94 static cl::opt<bool> ForceMinByValParamAlign(
95     "nvptx-force-min-byval-param-align", cl::Hidden,
96     cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
97              " params of device functions."),
98     cl::init(false));
99 
100 int NVPTXTargetLowering::getDivF32Level() const {
101   if (UsePrecDivF32.getNumOccurrences() > 0) {
102     // If nvptx-prec-div32=N is used on the command-line, always honor it
103     return UsePrecDivF32;
104   } else {
105     // Otherwise, use div.approx if fast math is enabled
106     if (getTargetMachine().Options.UnsafeFPMath)
107       return 0;
108     else
109       return 2;
110   }
111 }
112 
113 bool NVPTXTargetLowering::usePrecSqrtF32() const {
114   if (UsePrecSqrtF32.getNumOccurrences() > 0) {
115     // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
116     return UsePrecSqrtF32;
117   } else {
118     // Otherwise, use sqrt.approx if fast math is enabled
119     return !getTargetMachine().Options.UnsafeFPMath;
120   }
121 }
122 
123 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
124   return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
125          DenormalMode::PreserveSign;
126 }
127 
128 static bool IsPTXVectorType(MVT VT) {
129   switch (VT.SimpleTy) {
130   default:
131     return false;
132   case MVT::v2i1:
133   case MVT::v4i1:
134   case MVT::v2i8:
135   case MVT::v4i8:
136   case MVT::v2i16:
137   case MVT::v4i16:
138   case MVT::v8i16: // <4 x i16x2>
139   case MVT::v2i32:
140   case MVT::v4i32:
141   case MVT::v2i64:
142   case MVT::v2f16:
143   case MVT::v4f16:
144   case MVT::v8f16: // <4 x f16x2>
145   case MVT::v2bf16:
146   case MVT::v4bf16:
147   case MVT::v8bf16: // <4 x bf16x2>
148   case MVT::v2f32:
149   case MVT::v4f32:
150   case MVT::v2f64:
151     return true;
152   }
153 }
154 
155 static bool Is16bitsType(MVT VT) {
156   return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 ||
157           VT.SimpleTy == MVT::i16);
158 }
159 
160 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
161 /// EVTs that compose it.  Unlike ComputeValueVTs, this will break apart vectors
162 /// into their primitive components.
163 /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
164 /// same number of types as the Ins/Outs arrays in LowerFormalArguments,
165 /// LowerCall, and LowerReturn.
166 static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
167                                Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
168                                SmallVectorImpl<uint64_t> *Offsets = nullptr,
169                                uint64_t StartingOffset = 0) {
170   SmallVector<EVT, 16> TempVTs;
171   SmallVector<uint64_t, 16> TempOffsets;
172 
173   // Special case for i128 - decompose to (i64, i64)
174   if (Ty->isIntegerTy(128)) {
175     ValueVTs.push_back(EVT(MVT::i64));
176     ValueVTs.push_back(EVT(MVT::i64));
177 
178     if (Offsets) {
179       Offsets->push_back(StartingOffset + 0);
180       Offsets->push_back(StartingOffset + 8);
181     }
182 
183     return;
184   }
185 
186   // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
187   if (StructType *STy = dyn_cast<StructType>(Ty)) {
188     auto const *SL = DL.getStructLayout(STy);
189     auto ElementNum = 0;
190     for(auto *EI : STy->elements()) {
191       ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
192                          StartingOffset + SL->getElementOffset(ElementNum));
193       ++ElementNum;
194     }
195     return;
196   }
197 
198   ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
199   for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
200     EVT VT = TempVTs[i];
201     uint64_t Off = TempOffsets[i];
202     // Split vectors into individual elements, except for v2f16, which
203     // we will pass as a single scalar.
204     if (VT.isVector()) {
205       unsigned NumElts = VT.getVectorNumElements();
206       EVT EltVT = VT.getVectorElementType();
207       // Vectors with an even number of f16 elements will be passed to
208       // us as an array of v2f16/v2bf16 elements. We must match this so we
209       // stay in sync with Ins/Outs.
210       if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) {
211         switch (EltVT.getSimpleVT().SimpleTy) {
212         case MVT::f16:
213           EltVT = MVT::v2f16;
214           break;
215         case MVT::bf16:
216           EltVT = MVT::v2bf16;
217           break;
218         case MVT::i16:
219           EltVT = MVT::v2i16;
220           break;
221         default:
222           llvm_unreachable("Unexpected type");
223         }
224         NumElts /= 2;
225       } else if (EltVT.getSimpleVT() == MVT::i8 &&
226                  (NumElts % 4 == 0 || NumElts == 3)) {
227         // v*i8 are formally lowered as v4i8
228         EltVT = MVT::v4i8;
229         NumElts = (NumElts + 3) / 4;
230       }
231       for (unsigned j = 0; j != NumElts; ++j) {
232         ValueVTs.push_back(EltVT);
233         if (Offsets)
234           Offsets->push_back(Off + j * EltVT.getStoreSize());
235       }
236     } else {
237       ValueVTs.push_back(VT);
238       if (Offsets)
239         Offsets->push_back(Off);
240     }
241   }
242 }
243 
244 /// PromoteScalarIntegerPTX
245 /// Used to make sure the arguments/returns are suitable for passing
246 /// and promote them to a larger size if they're not.
247 ///
248 /// The promoted type is placed in \p PromoteVT if the function returns true.
249 static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
250   if (VT.isScalarInteger()) {
251     switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
252     default:
253       llvm_unreachable(
254           "Promotion is not suitable for scalars of size larger than 64-bits");
255     case 1:
256       *PromotedVT = MVT::i1;
257       break;
258     case 2:
259     case 4:
260     case 8:
261       *PromotedVT = MVT::i8;
262       break;
263     case 16:
264       *PromotedVT = MVT::i16;
265       break;
266     case 32:
267       *PromotedVT = MVT::i32;
268       break;
269     case 64:
270       *PromotedVT = MVT::i64;
271       break;
272     }
273     return EVT(*PromotedVT) != VT;
274   }
275   return false;
276 }
277 
278 // Check whether we can merge loads/stores of some of the pieces of a
279 // flattened function parameter or return value into a single vector
280 // load/store.
281 //
282 // The flattened parameter is represented as a list of EVTs and
283 // offsets, and the whole structure is aligned to ParamAlignment. This
284 // function determines whether we can load/store pieces of the
285 // parameter starting at index Idx using a single vectorized op of
286 // size AccessSize. If so, it returns the number of param pieces
287 // covered by the vector op. Otherwise, it returns 1.
288 static unsigned CanMergeParamLoadStoresStartingAt(
289     unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
290     const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
291 
292   // Can't vectorize if param alignment is not sufficient.
293   if (ParamAlignment < AccessSize)
294     return 1;
295   // Can't vectorize if offset is not aligned.
296   if (Offsets[Idx] & (AccessSize - 1))
297     return 1;
298 
299   EVT EltVT = ValueVTs[Idx];
300   unsigned EltSize = EltVT.getStoreSize();
301 
302   // Element is too large to vectorize.
303   if (EltSize >= AccessSize)
304     return 1;
305 
306   unsigned NumElts = AccessSize / EltSize;
307   // Can't vectorize if AccessBytes if not a multiple of EltSize.
308   if (AccessSize != EltSize * NumElts)
309     return 1;
310 
311   // We don't have enough elements to vectorize.
312   if (Idx + NumElts > ValueVTs.size())
313     return 1;
314 
315   // PTX ISA can only deal with 2- and 4-element vector ops.
316   if (NumElts != 4 && NumElts != 2)
317     return 1;
318 
319   for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
320     // Types do not match.
321     if (ValueVTs[j] != EltVT)
322       return 1;
323 
324     // Elements are not contiguous.
325     if (Offsets[j] - Offsets[j - 1] != EltSize)
326       return 1;
327   }
328   // OK. We can vectorize ValueVTs[i..i+NumElts)
329   return NumElts;
330 }
331 
// Per-piece vectorization state for the loads/stores of a flattened function
// parameter or return value. FIRST and LAST are independent bits; a piece
// carrying both is a stand-alone scalar, a piece carrying neither sits in the
// middle of a vector.
enum ParamVectorizationFlags {
  PVF_INNER = 0,      // Middle elements of a vector.
  PVF_FIRST = 1 << 0, // First element of the vector.
  PVF_LAST = 1 << 1,  // Last element of the vector.
  // Scalar is effectively a 1-element vector.
  PVF_SCALAR = PVF_FIRST | PVF_LAST
};
341 
342 // Computes whether and how we can vectorize the loads/stores of a
343 // flattened function parameter or return value.
344 //
345 // The flattened parameter is represented as the list of ValueVTs and
346 // Offsets, and is aligned to ParamAlignment bytes. We return a vector
347 // of the same size as ValueVTs indicating how each piece should be
348 // loaded/stored (i.e. as a scalar, or as part of a vector
349 // load/store).
350 static SmallVector<ParamVectorizationFlags, 16>
351 VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
352                      const SmallVectorImpl<uint64_t> &Offsets,
353                      Align ParamAlignment, bool IsVAArg = false) {
354   // Set vector size to match ValueVTs and mark all elements as
355   // scalars by default.
356   SmallVector<ParamVectorizationFlags, 16> VectorInfo;
357   VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
358 
359   if (IsVAArg)
360     return VectorInfo;
361 
362   // Check what we can vectorize using 128/64/32-bit accesses.
363   for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
364     // Skip elements we've already processed.
365     assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
366     for (unsigned AccessSize : {16, 8, 4, 2}) {
367       unsigned NumElts = CanMergeParamLoadStoresStartingAt(
368           I, AccessSize, ValueVTs, Offsets, ParamAlignment);
369       // Mark vectorized elements.
370       switch (NumElts) {
371       default:
372         llvm_unreachable("Unexpected return value");
373       case 1:
374         // Can't vectorize using this size, try next smaller size.
375         continue;
376       case 2:
377         assert(I + 1 < E && "Not enough elements.");
378         VectorInfo[I] = PVF_FIRST;
379         VectorInfo[I + 1] = PVF_LAST;
380         I += 1;
381         break;
382       case 4:
383         assert(I + 3 < E && "Not enough elements.");
384         VectorInfo[I] = PVF_FIRST;
385         VectorInfo[I + 1] = PVF_INNER;
386         VectorInfo[I + 2] = PVF_INNER;
387         VectorInfo[I + 3] = PVF_LAST;
388         I += 3;
389         break;
390       }
391       // Break out of the inner loop because we've already succeeded
392       // using largest possible AccessSize.
393       break;
394     }
395   }
396   return VectorInfo;
397 }
398 
399 // NVPTXTargetLowering Constructor.
400 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
401                                          const NVPTXSubtarget &STI)
402     : TargetLowering(TM), nvTM(&TM), STI(STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or
  // memmove.
406   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF;
407   MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned) 0xFFFFFFFF;
408   MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned) 0xFFFFFFFF;
409 
410   setBooleanContents(ZeroOrNegativeOneBooleanContent);
411   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
412 
413   // Jump is Expensive. Don't create extra control flow for 'and', 'or'
414   // condition branches.
415   setJumpIsExpensive(true);
416 
417   // Wide divides are _very_ slow. Try to reduce the width of the divide if
418   // possible.
419   addBypassSlowDiv(64, 32);
420 
421   // By default, use the Source scheduling
422   if (sched4reg)
423     setSchedulingPreference(Sched::RegPressure);
424   else
425     setSchedulingPreference(Sched::Source);
426 
427   auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
428                                     LegalizeAction NoF16Action) {
429     setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
430   };
431 
432   auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
433                                     LegalizeAction NoBF16Action) {
434     bool IsOpSupported = STI.hasBF16Math();
435     // Few instructions are available on sm_90 only
436     switch(Op) {
437       case ISD::FADD:
438       case ISD::FMUL:
439       case ISD::FSUB:
440       case ISD::SELECT:
441       case ISD::SELECT_CC:
442       case ISD::SETCC:
443       case ISD::FEXP2:
444       case ISD::FCEIL:
445       case ISD::FFLOOR:
446       case ISD::FNEARBYINT:
447       case ISD::FRINT:
448       case ISD::FTRUNC:
449         IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;
450         break;
451     }
452     setOperationAction(
453         Op, VT, IsOpSupported ? Action : NoBF16Action);
454   };
455 
456   auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
457                                      LegalizeAction NoI16x2Action) {
458     bool IsOpSupported = false;
459     // instructions are available on sm_90 only
460     switch (Op) {
461     case ISD::ADD:
462     case ISD::SMAX:
463     case ISD::SMIN:
464     case ISD::UMIN:
465     case ISD::UMAX:
466     case ISD::SUB:
467       IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
468       break;
469     }
470     setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
471   };
472 
473   addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
474   addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
475   addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
476   addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
477   addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
478   addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
479   addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
480   addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
481   addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
482   addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
483   addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
484   addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);
485 
486   // Conversion to/from FP16/FP16x2 is always legal.
487   setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
488   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
489   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
490   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);
491 
492   setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
493   setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
494 
495   // Conversion to/from BFP16/BFP16x2 is always legal.
496   setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Custom);
497   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2bf16, Custom);
498   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2bf16, Expand);
499   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2bf16, Expand);
500 
501   setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
502   setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
503   if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
504     AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
505 
506   // Conversion to/from i16/i16x2 is always legal.
507   setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
508   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
509   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand);
510   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand);
511 
512   setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);
513   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
514   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
515   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
516   // Only logical ops can be done on v4i8 directly, others must be done
517   // elementwise.
518   setOperationAction(
519       {ISD::ABS,         ISD::ADD,        ISD::ADDC,        ISD::ADDE,
520        ISD::BITREVERSE,  ISD::CTLZ,       ISD::CTPOP,       ISD::CTTZ,
521        ISD::FP_TO_SINT,  ISD::FP_TO_UINT, ISD::FSHL,        ISD::FSHR,
522        ISD::MUL,         ISD::MULHS,      ISD::MULHU,       ISD::PARITY,
523        ISD::ROTL,        ISD::ROTR,       ISD::SADDO,       ISD::SADDO_CARRY,
524        ISD::SADDSAT,     ISD::SDIV,       ISD::SDIVREM,     ISD::SELECT_CC,
525        ISD::SETCC,       ISD::SHL,        ISD::SINT_TO_FP,  ISD::SMAX,
526        ISD::SMIN,        ISD::SMULO,      ISD::SMUL_LOHI,   ISD::SRA,
527        ISD::SREM,        ISD::SRL,        ISD::SSHLSAT,     ISD::SSUBO,
528        ISD::SSUBO_CARRY, ISD::SSUBSAT,    ISD::SUB,         ISD::SUBC,
529        ISD::SUBE,        ISD::UADDO,      ISD::UADDO_CARRY, ISD::UADDSAT,
530        ISD::UDIV,        ISD::UDIVREM,    ISD::UINT_TO_FP,  ISD::UMAX,
531        ISD::UMIN,        ISD::UMULO,      ISD::UMUL_LOHI,   ISD::UREM,
532        ISD::USHLSAT,     ISD::USUBO,      ISD::USUBO_CARRY, ISD::VSELECT,
533        ISD::USUBSAT},
534       MVT::v4i8, Expand);
535 
536   // Operations not directly supported by NVPTX.
537   for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
538                  MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
539                  MVT::i32, MVT::i64}) {
540     setOperationAction(ISD::SELECT_CC, VT, Expand);
541     setOperationAction(ISD::BR_CC, VT, Expand);
542   }
543 
544   // Some SIGN_EXTEND_INREG can be done using cvt instruction.
545   // For others we will expand to a SHL/SRA pair.
546   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
547   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
548   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
549   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
550   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
551   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
552 
553   setOperationAction(ISD::SHL_PARTS, MVT::i32  , Custom);
554   setOperationAction(ISD::SRA_PARTS, MVT::i32  , Custom);
555   setOperationAction(ISD::SRL_PARTS, MVT::i32  , Custom);
556   setOperationAction(ISD::SHL_PARTS, MVT::i64  , Custom);
557   setOperationAction(ISD::SRA_PARTS, MVT::i64  , Custom);
558   setOperationAction(ISD::SRL_PARTS, MVT::i64  , Custom);
559 
560   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
561   setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
562 
563   // TODO: we may consider expanding ROTL/ROTR on older GPUs.  Currently on GPUs
564   // that don't have h/w rotation we lower them to multi-instruction assembly.
565   // See ROT*_sw in NVPTXIntrInfo.td
566   setOperationAction(ISD::ROTL, MVT::i64, Legal);
567   setOperationAction(ISD::ROTR, MVT::i64, Legal);
568   setOperationAction(ISD::ROTL, MVT::i32, Legal);
569   setOperationAction(ISD::ROTR, MVT::i32, Legal);
570 
571   setOperationAction(ISD::ROTL, MVT::i16, Expand);
572   setOperationAction(ISD::ROTL, MVT::v2i16, Expand);
573   setOperationAction(ISD::ROTR, MVT::i16, Expand);
574   setOperationAction(ISD::ROTR, MVT::v2i16, Expand);
575   setOperationAction(ISD::ROTL, MVT::i8, Expand);
576   setOperationAction(ISD::ROTR, MVT::i8, Expand);
577   setOperationAction(ISD::BSWAP, MVT::i16, Expand);
578   setOperationAction(ISD::BSWAP, MVT::v2i16, Expand);
579   setOperationAction(ISD::BSWAP, MVT::i32, Expand);
580   setOperationAction(ISD::BSWAP, MVT::i64, Expand);
581 
582   // Indirect branch is not supported.
583   // This also disables Jump Table creation.
584   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
585   setOperationAction(ISD::BRIND, MVT::Other, Expand);
586 
587   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
588   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
589 
590   // We want to legalize constant related memmove and memcopy
591   // intrinsics.
592   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
593 
594   // Turn FP extload into load/fpextend
595   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
596   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
597   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
598   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
599   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
600   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
601   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
602   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
603   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
604   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
605   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
606   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
607   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
608   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
609   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
610   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
611   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
612   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
613   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
614   // Turn FP truncstore into trunc + store.
615   // FIXME: vector types should also be expanded
616   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
617   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
618   setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
619   setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
620   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
621 
622   // PTX does not support load / store predicate registers
623   setOperationAction(ISD::LOAD, MVT::i1, Custom);
624   setOperationAction(ISD::STORE, MVT::i1, Custom);
625 
626   for (MVT VT : MVT::integer_valuetypes()) {
627     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
628     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
629     setTruncStoreAction(VT, MVT::i1, Expand);
630   }
631 
632   // expand extload of vector of integers.
633   setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
634                    MVT::v2i8, Expand);
635   setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
636 
637   // This is legal in NVPTX
638   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
639   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
640   setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
641   setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
642 
643   // Lowering of DYNAMIC_STACKALLOC is unsupported.
644   // Custom lower to produce an error.
645   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
646   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
647 
648   // TRAP can be lowered to PTX trap
649   setOperationAction(ISD::TRAP, MVT::Other, Legal);
650 
651   // Register custom handling for vector loads/stores
652   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
653     if (IsPTXVectorType(VT)) {
654       setOperationAction(ISD::LOAD, VT, Custom);
655       setOperationAction(ISD::STORE, VT, Custom);
656       setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
657     }
658   }
659 
660   // Support varargs.
661   setOperationAction(ISD::VASTART, MVT::Other, Custom);
662   setOperationAction(ISD::VAARG, MVT::Other, Custom);
663   setOperationAction(ISD::VACOPY, MVT::Other, Expand);
664   setOperationAction(ISD::VAEND, MVT::Other, Expand);
665 
666   // Custom handling for i8 intrinsics
667   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
668 
669   for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
670     setOperationAction(ISD::ABS,  Ty, Legal);
671     setOperationAction(ISD::SMIN, Ty, Legal);
672     setOperationAction(ISD::SMAX, Ty, Legal);
673     setOperationAction(ISD::UMIN, Ty, Legal);
674     setOperationAction(ISD::UMAX, Ty, Legal);
675 
676     setOperationAction(ISD::CTPOP, Ty, Legal);
677     setOperationAction(ISD::CTLZ, Ty, Legal);
678   }
679 
680   setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
681   setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
682   setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
683   setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
684   setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
685   setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
686   setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
687 
688   setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
689   setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
690   setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
691   setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
692   setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
693   setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
694 
695   // Other arithmetic and logic ops are unsupported.
696   setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,
697                       ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
698                       ISD::SINT_TO_FP, ISD::UINT_TO_FP},
699                      MVT::v2i16, Expand);
700 
701   setOperationAction(ISD::ADDC, MVT::i32, Legal);
702   setOperationAction(ISD::ADDE, MVT::i32, Legal);
703   setOperationAction(ISD::SUBC, MVT::i32, Legal);
704   setOperationAction(ISD::SUBE, MVT::i32, Legal);
705   if (STI.getPTXVersion() >= 43) {
706     setOperationAction(ISD::ADDC, MVT::i64, Legal);
707     setOperationAction(ISD::ADDE, MVT::i64, Legal);
708     setOperationAction(ISD::SUBC, MVT::i64, Legal);
709     setOperationAction(ISD::SUBE, MVT::i64, Legal);
710   }
711 
712   setOperationAction(ISD::CTTZ, MVT::i16, Expand);
713   setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
714   setOperationAction(ISD::CTTZ, MVT::i32, Expand);
715   setOperationAction(ISD::CTTZ, MVT::i64, Expand);
716 
717   // PTX does not directly support SELP of i1, so promote to i32 first
718   setOperationAction(ISD::SELECT, MVT::i1, Custom);
719 
720   // PTX cannot multiply two i64s in a single instruction.
721   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
722   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
723 
724   // We have some custom DAG combine patterns for these nodes
725   setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
726                        ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
727                        ISD::VSELECT});
728 
729   // setcc for f16x2 and bf16x2 needs special handling to prevent
730   // legalizer's attempt to scalarize it due to v2i1 not being legal.
731   if (STI.allowFP16Math() || STI.hasBF16Math())
732     setTargetDAGCombine(ISD::SETCC);
733 
734   // Promote fp16 arithmetic if fp16 hardware isn't available or the
735   // user passed --nvptx-no-fp16-math. The flag is useful because,
736   // although sm_53+ GPUs have some sort of FP16 support in
737   // hardware, only sm_53 and sm_60 have full implementation. Others
738   // only have token amount of hardware and are likely to run faster
739   // by using fp32 units instead.
740   for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
741     setFP16OperationAction(Op, MVT::f16, Legal, Promote);
742     setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
743     setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
744     // bf16 must be promoted to f32.
745     setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
746     if (getOperationAction(Op, MVT::bf16) == Promote)
747       AddPromotedToType(Op, MVT::bf16, MVT::f32);
748   }
749 
750   // f16/f16x2 neg was introduced in PTX 60, SM_53.
751   const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
752                                         STI.getPTXVersion() >= 60 &&
753                                         STI.allowFP16Math();
754   for (const auto &VT : {MVT::f16, MVT::v2f16})
755     setOperationAction(ISD::FNEG, VT,
756                        IsFP16FP16x2NegAvailable ? Legal : Expand);
757 
758   setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
759   setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
760   // (would be) Library functions.
761 
762   // These map to conversion instructions for scalar FP types.
763   for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
764                          ISD::FROUNDEVEN, ISD::FTRUNC}) {
765     setOperationAction(Op, MVT::f16, Legal);
766     setOperationAction(Op, MVT::f32, Legal);
767     setOperationAction(Op, MVT::f64, Legal);
768     setOperationAction(Op, MVT::v2f16, Expand);
769     setOperationAction(Op, MVT::v2bf16, Expand);
770     setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
771     if (getOperationAction(Op, MVT::bf16) == Promote)
772       AddPromotedToType(Op, MVT::bf16, MVT::f32);
773   }
774 
775   // sm_80 only has conversions between f32 and bf16. Custom lower all other
776   // bf16 conversions.
777   if (STI.hasBF16Math() &&
778       (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
779     for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
780       setOperationAction(
781           {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
782           VT, Custom);
783     }
784   }
785 
786   setOperationAction(ISD::FROUND, MVT::f16, Promote);
787   setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
788   setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);
789   setOperationAction(ISD::FROUND, MVT::f32, Custom);
790   setOperationAction(ISD::FROUND, MVT::f64, Custom);
791   setOperationAction(ISD::FROUND, MVT::bf16, Promote);
792   AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
793 
794   // 'Expand' implements FCOPYSIGN without calling an external library.
795   setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
796   setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
797   setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);
798   setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand);
799   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
800   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
801 
802   // These map to corresponding instructions for f32/f64. f16 must be
803   // promoted to f32. v2f16 is expanded to f16, which is then promoted
804   // to f32.
805   for (const auto &Op :
806        {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {
807     setOperationAction(Op, MVT::f16, Promote);
808     setOperationAction(Op, MVT::f32, Legal);
809     setOperationAction(Op, MVT::f64, Legal);
810     setOperationAction(Op, MVT::v2f16, Expand);
811     setOperationAction(Op, MVT::v2bf16, Expand);
812     setOperationAction(Op, MVT::bf16, Promote);
813     AddPromotedToType(Op, MVT::bf16, MVT::f32);
814   }
815   for (const auto &Op : {ISD::FABS}) {
816     setOperationAction(Op, MVT::f16, Promote);
817     setOperationAction(Op, MVT::f32, Legal);
818     setOperationAction(Op, MVT::f64, Legal);
819     setOperationAction(Op, MVT::v2f16, Expand);
820     setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
821     setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
822     if (getOperationAction(Op, MVT::bf16) == Promote)
823       AddPromotedToType(Op, MVT::bf16, MVT::f32);
824   }
825 
826   // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
827   auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
828     bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
829     return IsAtLeastSm80 ? Legal : NotSm80Action;
830   };
831   for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
832     setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
833     setOperationAction(Op, MVT::f32, Legal);
834     setOperationAction(Op, MVT::f64, Legal);
835     setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
836     setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
837     setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
838     if (getOperationAction(Op, MVT::bf16) == Promote)
839       AddPromotedToType(Op, MVT::bf16, MVT::f32);
840   }
841   for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
842     setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
843     setFP16OperationAction(Op, MVT::bf16, Legal, Expand);
844     setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
845     setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
846     setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
847   }
848 
849   // No FEXP2, FLOG2.  The PTX ex2 and log2 functions are always approximate.
850   // No FPOW or FREM in PTX.
851 
852   // Now deduce the information based on the above mentioned
853   // actions
854   computeRegisterProperties(STI.getRegisterInfo());
855 
856   setMinCmpXchgSizeInBits(32);
857   setMaxAtomicSizeInBitsSupported(64);
858 }
859 
860 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
861   switch ((NVPTXISD::NodeType)Opcode) {
862   case NVPTXISD::FIRST_NUMBER:
863     break;
864   case NVPTXISD::CALL:
865     return "NVPTXISD::CALL";
866   case NVPTXISD::RET_GLUE:
867     return "NVPTXISD::RET_GLUE";
868   case NVPTXISD::LOAD_PARAM:
869     return "NVPTXISD::LOAD_PARAM";
870   case NVPTXISD::Wrapper:
871     return "NVPTXISD::Wrapper";
872   case NVPTXISD::DeclareParam:
873     return "NVPTXISD::DeclareParam";
874   case NVPTXISD::DeclareScalarParam:
875     return "NVPTXISD::DeclareScalarParam";
876   case NVPTXISD::DeclareRet:
877     return "NVPTXISD::DeclareRet";
878   case NVPTXISD::DeclareScalarRet:
879     return "NVPTXISD::DeclareScalarRet";
880   case NVPTXISD::DeclareRetParam:
881     return "NVPTXISD::DeclareRetParam";
882   case NVPTXISD::PrintCall:
883     return "NVPTXISD::PrintCall";
884   case NVPTXISD::PrintConvergentCall:
885     return "NVPTXISD::PrintConvergentCall";
886   case NVPTXISD::PrintCallUni:
887     return "NVPTXISD::PrintCallUni";
888   case NVPTXISD::PrintConvergentCallUni:
889     return "NVPTXISD::PrintConvergentCallUni";
890   case NVPTXISD::LoadParam:
891     return "NVPTXISD::LoadParam";
892   case NVPTXISD::LoadParamV2:
893     return "NVPTXISD::LoadParamV2";
894   case NVPTXISD::LoadParamV4:
895     return "NVPTXISD::LoadParamV4";
896   case NVPTXISD::StoreParam:
897     return "NVPTXISD::StoreParam";
898   case NVPTXISD::StoreParamV2:
899     return "NVPTXISD::StoreParamV2";
900   case NVPTXISD::StoreParamV4:
901     return "NVPTXISD::StoreParamV4";
902   case NVPTXISD::StoreParamS32:
903     return "NVPTXISD::StoreParamS32";
904   case NVPTXISD::StoreParamU32:
905     return "NVPTXISD::StoreParamU32";
906   case NVPTXISD::CallArgBegin:
907     return "NVPTXISD::CallArgBegin";
908   case NVPTXISD::CallArg:
909     return "NVPTXISD::CallArg";
910   case NVPTXISD::LastCallArg:
911     return "NVPTXISD::LastCallArg";
912   case NVPTXISD::CallArgEnd:
913     return "NVPTXISD::CallArgEnd";
914   case NVPTXISD::CallVoid:
915     return "NVPTXISD::CallVoid";
916   case NVPTXISD::CallVal:
917     return "NVPTXISD::CallVal";
918   case NVPTXISD::CallSymbol:
919     return "NVPTXISD::CallSymbol";
920   case NVPTXISD::Prototype:
921     return "NVPTXISD::Prototype";
922   case NVPTXISD::MoveParam:
923     return "NVPTXISD::MoveParam";
924   case NVPTXISD::StoreRetval:
925     return "NVPTXISD::StoreRetval";
926   case NVPTXISD::StoreRetvalV2:
927     return "NVPTXISD::StoreRetvalV2";
928   case NVPTXISD::StoreRetvalV4:
929     return "NVPTXISD::StoreRetvalV4";
930   case NVPTXISD::PseudoUseParam:
931     return "NVPTXISD::PseudoUseParam";
932   case NVPTXISD::RETURN:
933     return "NVPTXISD::RETURN";
934   case NVPTXISD::CallSeqBegin:
935     return "NVPTXISD::CallSeqBegin";
936   case NVPTXISD::CallSeqEnd:
937     return "NVPTXISD::CallSeqEnd";
938   case NVPTXISD::CallPrototype:
939     return "NVPTXISD::CallPrototype";
940   case NVPTXISD::ProxyReg:
941     return "NVPTXISD::ProxyReg";
942   case NVPTXISD::LoadV2:
943     return "NVPTXISD::LoadV2";
944   case NVPTXISD::LoadV4:
945     return "NVPTXISD::LoadV4";
946   case NVPTXISD::LDGV2:
947     return "NVPTXISD::LDGV2";
948   case NVPTXISD::LDGV4:
949     return "NVPTXISD::LDGV4";
950   case NVPTXISD::LDUV2:
951     return "NVPTXISD::LDUV2";
952   case NVPTXISD::LDUV4:
953     return "NVPTXISD::LDUV4";
954   case NVPTXISD::StoreV2:
955     return "NVPTXISD::StoreV2";
956   case NVPTXISD::StoreV4:
957     return "NVPTXISD::StoreV4";
958   case NVPTXISD::FUN_SHFL_CLAMP:
959     return "NVPTXISD::FUN_SHFL_CLAMP";
960   case NVPTXISD::FUN_SHFR_CLAMP:
961     return "NVPTXISD::FUN_SHFR_CLAMP";
962   case NVPTXISD::IMAD:
963     return "NVPTXISD::IMAD";
964   case NVPTXISD::BFE:
965     return "NVPTXISD::BFE";
966   case NVPTXISD::BFI:
967     return "NVPTXISD::BFI";
968   case NVPTXISD::PRMT:
969     return "NVPTXISD::PRMT";
970   case NVPTXISD::SETP_F16X2:
971     return "NVPTXISD::SETP_F16X2";
972   case NVPTXISD::SETP_BF16X2:
973     return "NVPTXISD::SETP_BF16X2";
974   case NVPTXISD::Dummy:
975     return "NVPTXISD::Dummy";
976   case NVPTXISD::MUL_WIDE_SIGNED:
977     return "NVPTXISD::MUL_WIDE_SIGNED";
978   case NVPTXISD::MUL_WIDE_UNSIGNED:
979     return "NVPTXISD::MUL_WIDE_UNSIGNED";
980   case NVPTXISD::Tex1DFloatS32:        return "NVPTXISD::Tex1DFloatS32";
981   case NVPTXISD::Tex1DFloatFloat:      return "NVPTXISD::Tex1DFloatFloat";
982   case NVPTXISD::Tex1DFloatFloatLevel:
983     return "NVPTXISD::Tex1DFloatFloatLevel";
984   case NVPTXISD::Tex1DFloatFloatGrad:
985     return "NVPTXISD::Tex1DFloatFloatGrad";
986   case NVPTXISD::Tex1DS32S32:          return "NVPTXISD::Tex1DS32S32";
987   case NVPTXISD::Tex1DS32Float:        return "NVPTXISD::Tex1DS32Float";
988   case NVPTXISD::Tex1DS32FloatLevel:
989     return "NVPTXISD::Tex1DS32FloatLevel";
990   case NVPTXISD::Tex1DS32FloatGrad:
991     return "NVPTXISD::Tex1DS32FloatGrad";
992   case NVPTXISD::Tex1DU32S32:          return "NVPTXISD::Tex1DU32S32";
993   case NVPTXISD::Tex1DU32Float:        return "NVPTXISD::Tex1DU32Float";
994   case NVPTXISD::Tex1DU32FloatLevel:
995     return "NVPTXISD::Tex1DU32FloatLevel";
996   case NVPTXISD::Tex1DU32FloatGrad:
997     return "NVPTXISD::Tex1DU32FloatGrad";
998   case NVPTXISD::Tex1DArrayFloatS32:   return "NVPTXISD::Tex1DArrayFloatS32";
999   case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
1000   case NVPTXISD::Tex1DArrayFloatFloatLevel:
1001     return "NVPTXISD::Tex1DArrayFloatFloatLevel";
1002   case NVPTXISD::Tex1DArrayFloatFloatGrad:
1003     return "NVPTXISD::Tex1DArrayFloatFloatGrad";
1004   case NVPTXISD::Tex1DArrayS32S32:     return "NVPTXISD::Tex1DArrayS32S32";
1005   case NVPTXISD::Tex1DArrayS32Float:   return "NVPTXISD::Tex1DArrayS32Float";
1006   case NVPTXISD::Tex1DArrayS32FloatLevel:
1007     return "NVPTXISD::Tex1DArrayS32FloatLevel";
1008   case NVPTXISD::Tex1DArrayS32FloatGrad:
1009     return "NVPTXISD::Tex1DArrayS32FloatGrad";
1010   case NVPTXISD::Tex1DArrayU32S32:     return "NVPTXISD::Tex1DArrayU32S32";
1011   case NVPTXISD::Tex1DArrayU32Float:   return "NVPTXISD::Tex1DArrayU32Float";
1012   case NVPTXISD::Tex1DArrayU32FloatLevel:
1013     return "NVPTXISD::Tex1DArrayU32FloatLevel";
1014   case NVPTXISD::Tex1DArrayU32FloatGrad:
1015     return "NVPTXISD::Tex1DArrayU32FloatGrad";
1016   case NVPTXISD::Tex2DFloatS32:        return "NVPTXISD::Tex2DFloatS32";
1017   case NVPTXISD::Tex2DFloatFloat:      return "NVPTXISD::Tex2DFloatFloat";
1018   case NVPTXISD::Tex2DFloatFloatLevel:
1019     return "NVPTXISD::Tex2DFloatFloatLevel";
1020   case NVPTXISD::Tex2DFloatFloatGrad:
1021     return "NVPTXISD::Tex2DFloatFloatGrad";
1022   case NVPTXISD::Tex2DS32S32:          return "NVPTXISD::Tex2DS32S32";
1023   case NVPTXISD::Tex2DS32Float:        return "NVPTXISD::Tex2DS32Float";
1024   case NVPTXISD::Tex2DS32FloatLevel:
1025     return "NVPTXISD::Tex2DS32FloatLevel";
1026   case NVPTXISD::Tex2DS32FloatGrad:
1027     return "NVPTXISD::Tex2DS32FloatGrad";
1028   case NVPTXISD::Tex2DU32S32:          return "NVPTXISD::Tex2DU32S32";
1029   case NVPTXISD::Tex2DU32Float:        return "NVPTXISD::Tex2DU32Float";
1030   case NVPTXISD::Tex2DU32FloatLevel:
1031     return "NVPTXISD::Tex2DU32FloatLevel";
1032   case NVPTXISD::Tex2DU32FloatGrad:
1033     return "NVPTXISD::Tex2DU32FloatGrad";
1034   case NVPTXISD::Tex2DArrayFloatS32:   return "NVPTXISD::Tex2DArrayFloatS32";
1035   case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
1036   case NVPTXISD::Tex2DArrayFloatFloatLevel:
1037     return "NVPTXISD::Tex2DArrayFloatFloatLevel";
1038   case NVPTXISD::Tex2DArrayFloatFloatGrad:
1039     return "NVPTXISD::Tex2DArrayFloatFloatGrad";
1040   case NVPTXISD::Tex2DArrayS32S32:     return "NVPTXISD::Tex2DArrayS32S32";
1041   case NVPTXISD::Tex2DArrayS32Float:   return "NVPTXISD::Tex2DArrayS32Float";
1042   case NVPTXISD::Tex2DArrayS32FloatLevel:
1043     return "NVPTXISD::Tex2DArrayS32FloatLevel";
1044   case NVPTXISD::Tex2DArrayS32FloatGrad:
1045     return "NVPTXISD::Tex2DArrayS32FloatGrad";
1046   case NVPTXISD::Tex2DArrayU32S32:     return "NVPTXISD::Tex2DArrayU32S32";
1047   case NVPTXISD::Tex2DArrayU32Float:   return "NVPTXISD::Tex2DArrayU32Float";
1048   case NVPTXISD::Tex2DArrayU32FloatLevel:
1049     return "NVPTXISD::Tex2DArrayU32FloatLevel";
1050   case NVPTXISD::Tex2DArrayU32FloatGrad:
1051     return "NVPTXISD::Tex2DArrayU32FloatGrad";
1052   case NVPTXISD::Tex3DFloatS32:        return "NVPTXISD::Tex3DFloatS32";
1053   case NVPTXISD::Tex3DFloatFloat:      return "NVPTXISD::Tex3DFloatFloat";
1054   case NVPTXISD::Tex3DFloatFloatLevel:
1055     return "NVPTXISD::Tex3DFloatFloatLevel";
1056   case NVPTXISD::Tex3DFloatFloatGrad:
1057     return "NVPTXISD::Tex3DFloatFloatGrad";
1058   case NVPTXISD::Tex3DS32S32:          return "NVPTXISD::Tex3DS32S32";
1059   case NVPTXISD::Tex3DS32Float:        return "NVPTXISD::Tex3DS32Float";
1060   case NVPTXISD::Tex3DS32FloatLevel:
1061     return "NVPTXISD::Tex3DS32FloatLevel";
1062   case NVPTXISD::Tex3DS32FloatGrad:
1063     return "NVPTXISD::Tex3DS32FloatGrad";
1064   case NVPTXISD::Tex3DU32S32:          return "NVPTXISD::Tex3DU32S32";
1065   case NVPTXISD::Tex3DU32Float:        return "NVPTXISD::Tex3DU32Float";
1066   case NVPTXISD::Tex3DU32FloatLevel:
1067     return "NVPTXISD::Tex3DU32FloatLevel";
1068   case NVPTXISD::Tex3DU32FloatGrad:
1069     return "NVPTXISD::Tex3DU32FloatGrad";
1070   case NVPTXISD::TexCubeFloatFloat:      return "NVPTXISD::TexCubeFloatFloat";
1071   case NVPTXISD::TexCubeFloatFloatLevel:
1072     return "NVPTXISD::TexCubeFloatFloatLevel";
1073   case NVPTXISD::TexCubeS32Float:        return "NVPTXISD::TexCubeS32Float";
1074   case NVPTXISD::TexCubeS32FloatLevel:
1075     return "NVPTXISD::TexCubeS32FloatLevel";
1076   case NVPTXISD::TexCubeU32Float:        return "NVPTXISD::TexCubeU32Float";
1077   case NVPTXISD::TexCubeU32FloatLevel:
1078     return "NVPTXISD::TexCubeU32FloatLevel";
1079   case NVPTXISD::TexCubeArrayFloatFloat:
1080     return "NVPTXISD::TexCubeArrayFloatFloat";
1081   case NVPTXISD::TexCubeArrayFloatFloatLevel:
1082     return "NVPTXISD::TexCubeArrayFloatFloatLevel";
1083   case NVPTXISD::TexCubeArrayS32Float:
1084     return "NVPTXISD::TexCubeArrayS32Float";
1085   case NVPTXISD::TexCubeArrayS32FloatLevel:
1086     return "NVPTXISD::TexCubeArrayS32FloatLevel";
1087   case NVPTXISD::TexCubeArrayU32Float:
1088     return "NVPTXISD::TexCubeArrayU32Float";
1089   case NVPTXISD::TexCubeArrayU32FloatLevel:
1090     return "NVPTXISD::TexCubeArrayU32FloatLevel";
1091   case NVPTXISD::Tld4R2DFloatFloat:
1092     return "NVPTXISD::Tld4R2DFloatFloat";
1093   case NVPTXISD::Tld4G2DFloatFloat:
1094     return "NVPTXISD::Tld4G2DFloatFloat";
1095   case NVPTXISD::Tld4B2DFloatFloat:
1096     return "NVPTXISD::Tld4B2DFloatFloat";
1097   case NVPTXISD::Tld4A2DFloatFloat:
1098     return "NVPTXISD::Tld4A2DFloatFloat";
1099   case NVPTXISD::Tld4R2DS64Float:
1100     return "NVPTXISD::Tld4R2DS64Float";
1101   case NVPTXISD::Tld4G2DS64Float:
1102     return "NVPTXISD::Tld4G2DS64Float";
1103   case NVPTXISD::Tld4B2DS64Float:
1104     return "NVPTXISD::Tld4B2DS64Float";
1105   case NVPTXISD::Tld4A2DS64Float:
1106     return "NVPTXISD::Tld4A2DS64Float";
1107   case NVPTXISD::Tld4R2DU64Float:
1108     return "NVPTXISD::Tld4R2DU64Float";
1109   case NVPTXISD::Tld4G2DU64Float:
1110     return "NVPTXISD::Tld4G2DU64Float";
1111   case NVPTXISD::Tld4B2DU64Float:
1112     return "NVPTXISD::Tld4B2DU64Float";
1113   case NVPTXISD::Tld4A2DU64Float:
1114     return "NVPTXISD::Tld4A2DU64Float";
1115 
1116   case NVPTXISD::TexUnified1DFloatS32:
1117     return "NVPTXISD::TexUnified1DFloatS32";
1118   case NVPTXISD::TexUnified1DFloatFloat:
1119     return "NVPTXISD::TexUnified1DFloatFloat";
1120   case NVPTXISD::TexUnified1DFloatFloatLevel:
1121     return "NVPTXISD::TexUnified1DFloatFloatLevel";
1122   case NVPTXISD::TexUnified1DFloatFloatGrad:
1123     return "NVPTXISD::TexUnified1DFloatFloatGrad";
1124   case NVPTXISD::TexUnified1DS32S32:
1125     return "NVPTXISD::TexUnified1DS32S32";
1126   case NVPTXISD::TexUnified1DS32Float:
1127     return "NVPTXISD::TexUnified1DS32Float";
1128   case NVPTXISD::TexUnified1DS32FloatLevel:
1129     return "NVPTXISD::TexUnified1DS32FloatLevel";
1130   case NVPTXISD::TexUnified1DS32FloatGrad:
1131     return "NVPTXISD::TexUnified1DS32FloatGrad";
1132   case NVPTXISD::TexUnified1DU32S32:
1133     return "NVPTXISD::TexUnified1DU32S32";
1134   case NVPTXISD::TexUnified1DU32Float:
1135     return "NVPTXISD::TexUnified1DU32Float";
1136   case NVPTXISD::TexUnified1DU32FloatLevel:
1137     return "NVPTXISD::TexUnified1DU32FloatLevel";
1138   case NVPTXISD::TexUnified1DU32FloatGrad:
1139     return "NVPTXISD::TexUnified1DU32FloatGrad";
1140   case NVPTXISD::TexUnified1DArrayFloatS32:
1141     return "NVPTXISD::TexUnified1DArrayFloatS32";
1142   case NVPTXISD::TexUnified1DArrayFloatFloat:
1143     return "NVPTXISD::TexUnified1DArrayFloatFloat";
1144   case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
1145     return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
1146   case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
1147     return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
1148   case NVPTXISD::TexUnified1DArrayS32S32:
1149     return "NVPTXISD::TexUnified1DArrayS32S32";
1150   case NVPTXISD::TexUnified1DArrayS32Float:
1151     return "NVPTXISD::TexUnified1DArrayS32Float";
1152   case NVPTXISD::TexUnified1DArrayS32FloatLevel:
1153     return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
1154   case NVPTXISD::TexUnified1DArrayS32FloatGrad:
1155     return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
1156   case NVPTXISD::TexUnified1DArrayU32S32:
1157     return "NVPTXISD::TexUnified1DArrayU32S32";
1158   case NVPTXISD::TexUnified1DArrayU32Float:
1159     return "NVPTXISD::TexUnified1DArrayU32Float";
1160   case NVPTXISD::TexUnified1DArrayU32FloatLevel:
1161     return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
1162   case NVPTXISD::TexUnified1DArrayU32FloatGrad:
1163     return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
1164   case NVPTXISD::TexUnified2DFloatS32:
1165     return "NVPTXISD::TexUnified2DFloatS32";
1166   case NVPTXISD::TexUnified2DFloatFloat:
1167     return "NVPTXISD::TexUnified2DFloatFloat";
1168   case NVPTXISD::TexUnified2DFloatFloatLevel:
1169     return "NVPTXISD::TexUnified2DFloatFloatLevel";
1170   case NVPTXISD::TexUnified2DFloatFloatGrad:
1171     return "NVPTXISD::TexUnified2DFloatFloatGrad";
1172   case NVPTXISD::TexUnified2DS32S32:
1173     return "NVPTXISD::TexUnified2DS32S32";
1174   case NVPTXISD::TexUnified2DS32Float:
1175     return "NVPTXISD::TexUnified2DS32Float";
1176   case NVPTXISD::TexUnified2DS32FloatLevel:
1177     return "NVPTXISD::TexUnified2DS32FloatLevel";
1178   case NVPTXISD::TexUnified2DS32FloatGrad:
1179     return "NVPTXISD::TexUnified2DS32FloatGrad";
1180   case NVPTXISD::TexUnified2DU32S32:
1181     return "NVPTXISD::TexUnified2DU32S32";
1182   case NVPTXISD::TexUnified2DU32Float:
1183     return "NVPTXISD::TexUnified2DU32Float";
1184   case NVPTXISD::TexUnified2DU32FloatLevel:
1185     return "NVPTXISD::TexUnified2DU32FloatLevel";
1186   case NVPTXISD::TexUnified2DU32FloatGrad:
1187     return "NVPTXISD::TexUnified2DU32FloatGrad";
1188   case NVPTXISD::TexUnified2DArrayFloatS32:
1189     return "NVPTXISD::TexUnified2DArrayFloatS32";
1190   case NVPTXISD::TexUnified2DArrayFloatFloat:
1191     return "NVPTXISD::TexUnified2DArrayFloatFloat";
1192   case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
1193     return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
1194   case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
1195     return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
1196   case NVPTXISD::TexUnified2DArrayS32S32:
1197     return "NVPTXISD::TexUnified2DArrayS32S32";
1198   case NVPTXISD::TexUnified2DArrayS32Float:
1199     return "NVPTXISD::TexUnified2DArrayS32Float";
1200   case NVPTXISD::TexUnified2DArrayS32FloatLevel:
1201     return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
1202   case NVPTXISD::TexUnified2DArrayS32FloatGrad:
1203     return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
1204   case NVPTXISD::TexUnified2DArrayU32S32:
1205     return "NVPTXISD::TexUnified2DArrayU32S32";
1206   case NVPTXISD::TexUnified2DArrayU32Float:
1207     return "NVPTXISD::TexUnified2DArrayU32Float";
1208   case NVPTXISD::TexUnified2DArrayU32FloatLevel:
1209     return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
1210   case NVPTXISD::TexUnified2DArrayU32FloatGrad:
1211     return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
1212   case NVPTXISD::TexUnified3DFloatS32:
1213     return "NVPTXISD::TexUnified3DFloatS32";
1214   case NVPTXISD::TexUnified3DFloatFloat:
1215     return "NVPTXISD::TexUnified3DFloatFloat";
1216   case NVPTXISD::TexUnified3DFloatFloatLevel:
1217     return "NVPTXISD::TexUnified3DFloatFloatLevel";
1218   case NVPTXISD::TexUnified3DFloatFloatGrad:
1219     return "NVPTXISD::TexUnified3DFloatFloatGrad";
1220   case NVPTXISD::TexUnified3DS32S32:
1221     return "NVPTXISD::TexUnified3DS32S32";
1222   case NVPTXISD::TexUnified3DS32Float:
1223     return "NVPTXISD::TexUnified3DS32Float";
1224   case NVPTXISD::TexUnified3DS32FloatLevel:
1225     return "NVPTXISD::TexUnified3DS32FloatLevel";
1226   case NVPTXISD::TexUnified3DS32FloatGrad:
1227     return "NVPTXISD::TexUnified3DS32FloatGrad";
1228   case NVPTXISD::TexUnified3DU32S32:
1229     return "NVPTXISD::TexUnified3DU32S32";
1230   case NVPTXISD::TexUnified3DU32Float:
1231     return "NVPTXISD::TexUnified3DU32Float";
1232   case NVPTXISD::TexUnified3DU32FloatLevel:
1233     return "NVPTXISD::TexUnified3DU32FloatLevel";
1234   case NVPTXISD::TexUnified3DU32FloatGrad:
1235     return "NVPTXISD::TexUnified3DU32FloatGrad";
1236   case NVPTXISD::TexUnifiedCubeFloatFloat:
1237     return "NVPTXISD::TexUnifiedCubeFloatFloat";
1238   case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
1239     return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
1240   case NVPTXISD::TexUnifiedCubeS32Float:
1241     return "NVPTXISD::TexUnifiedCubeS32Float";
1242   case NVPTXISD::TexUnifiedCubeS32FloatLevel:
1243     return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
1244   case NVPTXISD::TexUnifiedCubeU32Float:
1245     return "NVPTXISD::TexUnifiedCubeU32Float";
1246   case NVPTXISD::TexUnifiedCubeU32FloatLevel:
1247     return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
1248   case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
1249     return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
1250   case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
1251     return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
1252   case NVPTXISD::TexUnifiedCubeArrayS32Float:
1253     return "NVPTXISD::TexUnifiedCubeArrayS32Float";
1254   case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
1255     return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
1256   case NVPTXISD::TexUnifiedCubeArrayU32Float:
1257     return "NVPTXISD::TexUnifiedCubeArrayU32Float";
1258   case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
1259     return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
1260   case NVPTXISD::TexUnifiedCubeFloatFloatGrad:
1261     return "NVPTXISD::TexUnifiedCubeFloatFloatGrad";
1262   case NVPTXISD::TexUnifiedCubeS32FloatGrad:
1263     return "NVPTXISD::TexUnifiedCubeS32FloatGrad";
1264   case NVPTXISD::TexUnifiedCubeU32FloatGrad:
1265     return "NVPTXISD::TexUnifiedCubeU32FloatGrad";
1266   case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad:
1267     return "NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad";
1268   case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad:
1269     return "NVPTXISD::TexUnifiedCubeArrayS32FloatGrad";
1270   case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad:
1271     return "NVPTXISD::TexUnifiedCubeArrayU32FloatGrad";
1272   case NVPTXISD::Tld4UnifiedR2DFloatFloat:
1273     return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
1274   case NVPTXISD::Tld4UnifiedG2DFloatFloat:
1275     return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
1276   case NVPTXISD::Tld4UnifiedB2DFloatFloat:
1277     return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
1278   case NVPTXISD::Tld4UnifiedA2DFloatFloat:
1279     return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
1280   case NVPTXISD::Tld4UnifiedR2DS64Float:
1281     return "NVPTXISD::Tld4UnifiedR2DS64Float";
1282   case NVPTXISD::Tld4UnifiedG2DS64Float:
1283     return "NVPTXISD::Tld4UnifiedG2DS64Float";
1284   case NVPTXISD::Tld4UnifiedB2DS64Float:
1285     return "NVPTXISD::Tld4UnifiedB2DS64Float";
1286   case NVPTXISD::Tld4UnifiedA2DS64Float:
1287     return "NVPTXISD::Tld4UnifiedA2DS64Float";
1288   case NVPTXISD::Tld4UnifiedR2DU64Float:
1289     return "NVPTXISD::Tld4UnifiedR2DU64Float";
1290   case NVPTXISD::Tld4UnifiedG2DU64Float:
1291     return "NVPTXISD::Tld4UnifiedG2DU64Float";
1292   case NVPTXISD::Tld4UnifiedB2DU64Float:
1293     return "NVPTXISD::Tld4UnifiedB2DU64Float";
1294   case NVPTXISD::Tld4UnifiedA2DU64Float:
1295     return "NVPTXISD::Tld4UnifiedA2DU64Float";
1296 
1297   case NVPTXISD::Suld1DI8Clamp:          return "NVPTXISD::Suld1DI8Clamp";
1298   case NVPTXISD::Suld1DI16Clamp:         return "NVPTXISD::Suld1DI16Clamp";
1299   case NVPTXISD::Suld1DI32Clamp:         return "NVPTXISD::Suld1DI32Clamp";
1300   case NVPTXISD::Suld1DI64Clamp:         return "NVPTXISD::Suld1DI64Clamp";
1301   case NVPTXISD::Suld1DV2I8Clamp:        return "NVPTXISD::Suld1DV2I8Clamp";
1302   case NVPTXISD::Suld1DV2I16Clamp:       return "NVPTXISD::Suld1DV2I16Clamp";
1303   case NVPTXISD::Suld1DV2I32Clamp:       return "NVPTXISD::Suld1DV2I32Clamp";
1304   case NVPTXISD::Suld1DV2I64Clamp:       return "NVPTXISD::Suld1DV2I64Clamp";
1305   case NVPTXISD::Suld1DV4I8Clamp:        return "NVPTXISD::Suld1DV4I8Clamp";
1306   case NVPTXISD::Suld1DV4I16Clamp:       return "NVPTXISD::Suld1DV4I16Clamp";
1307   case NVPTXISD::Suld1DV4I32Clamp:       return "NVPTXISD::Suld1DV4I32Clamp";
1308 
1309   case NVPTXISD::Suld1DArrayI8Clamp:   return "NVPTXISD::Suld1DArrayI8Clamp";
1310   case NVPTXISD::Suld1DArrayI16Clamp:  return "NVPTXISD::Suld1DArrayI16Clamp";
1311   case NVPTXISD::Suld1DArrayI32Clamp:  return "NVPTXISD::Suld1DArrayI32Clamp";
1312   case NVPTXISD::Suld1DArrayI64Clamp:  return "NVPTXISD::Suld1DArrayI64Clamp";
1313   case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
1314   case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
1315   case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
1316   case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
1317   case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
1318   case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
1319   case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
1320 
1321   case NVPTXISD::Suld2DI8Clamp:          return "NVPTXISD::Suld2DI8Clamp";
1322   case NVPTXISD::Suld2DI16Clamp:         return "NVPTXISD::Suld2DI16Clamp";
1323   case NVPTXISD::Suld2DI32Clamp:         return "NVPTXISD::Suld2DI32Clamp";
1324   case NVPTXISD::Suld2DI64Clamp:         return "NVPTXISD::Suld2DI64Clamp";
1325   case NVPTXISD::Suld2DV2I8Clamp:        return "NVPTXISD::Suld2DV2I8Clamp";
1326   case NVPTXISD::Suld2DV2I16Clamp:       return "NVPTXISD::Suld2DV2I16Clamp";
1327   case NVPTXISD::Suld2DV2I32Clamp:       return "NVPTXISD::Suld2DV2I32Clamp";
1328   case NVPTXISD::Suld2DV2I64Clamp:       return "NVPTXISD::Suld2DV2I64Clamp";
1329   case NVPTXISD::Suld2DV4I8Clamp:        return "NVPTXISD::Suld2DV4I8Clamp";
1330   case NVPTXISD::Suld2DV4I16Clamp:       return "NVPTXISD::Suld2DV4I16Clamp";
1331   case NVPTXISD::Suld2DV4I32Clamp:       return "NVPTXISD::Suld2DV4I32Clamp";
1332 
1333   case NVPTXISD::Suld2DArrayI8Clamp:   return "NVPTXISD::Suld2DArrayI8Clamp";
1334   case NVPTXISD::Suld2DArrayI16Clamp:  return "NVPTXISD::Suld2DArrayI16Clamp";
1335   case NVPTXISD::Suld2DArrayI32Clamp:  return "NVPTXISD::Suld2DArrayI32Clamp";
1336   case NVPTXISD::Suld2DArrayI64Clamp:  return "NVPTXISD::Suld2DArrayI64Clamp";
1337   case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
1338   case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
1339   case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
1340   case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
1341   case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
1342   case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
1343   case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
1344 
1345   case NVPTXISD::Suld3DI8Clamp:          return "NVPTXISD::Suld3DI8Clamp";
1346   case NVPTXISD::Suld3DI16Clamp:         return "NVPTXISD::Suld3DI16Clamp";
1347   case NVPTXISD::Suld3DI32Clamp:         return "NVPTXISD::Suld3DI32Clamp";
1348   case NVPTXISD::Suld3DI64Clamp:         return "NVPTXISD::Suld3DI64Clamp";
1349   case NVPTXISD::Suld3DV2I8Clamp:        return "NVPTXISD::Suld3DV2I8Clamp";
1350   case NVPTXISD::Suld3DV2I16Clamp:       return "NVPTXISD::Suld3DV2I16Clamp";
1351   case NVPTXISD::Suld3DV2I32Clamp:       return "NVPTXISD::Suld3DV2I32Clamp";
1352   case NVPTXISD::Suld3DV2I64Clamp:       return "NVPTXISD::Suld3DV2I64Clamp";
1353   case NVPTXISD::Suld3DV4I8Clamp:        return "NVPTXISD::Suld3DV4I8Clamp";
1354   case NVPTXISD::Suld3DV4I16Clamp:       return "NVPTXISD::Suld3DV4I16Clamp";
1355   case NVPTXISD::Suld3DV4I32Clamp:       return "NVPTXISD::Suld3DV4I32Clamp";
1356 
1357   case NVPTXISD::Suld1DI8Trap:          return "NVPTXISD::Suld1DI8Trap";
1358   case NVPTXISD::Suld1DI16Trap:         return "NVPTXISD::Suld1DI16Trap";
1359   case NVPTXISD::Suld1DI32Trap:         return "NVPTXISD::Suld1DI32Trap";
1360   case NVPTXISD::Suld1DI64Trap:         return "NVPTXISD::Suld1DI64Trap";
1361   case NVPTXISD::Suld1DV2I8Trap:        return "NVPTXISD::Suld1DV2I8Trap";
1362   case NVPTXISD::Suld1DV2I16Trap:       return "NVPTXISD::Suld1DV2I16Trap";
1363   case NVPTXISD::Suld1DV2I32Trap:       return "NVPTXISD::Suld1DV2I32Trap";
1364   case NVPTXISD::Suld1DV2I64Trap:       return "NVPTXISD::Suld1DV2I64Trap";
1365   case NVPTXISD::Suld1DV4I8Trap:        return "NVPTXISD::Suld1DV4I8Trap";
1366   case NVPTXISD::Suld1DV4I16Trap:       return "NVPTXISD::Suld1DV4I16Trap";
1367   case NVPTXISD::Suld1DV4I32Trap:       return "NVPTXISD::Suld1DV4I32Trap";
1368 
1369   case NVPTXISD::Suld1DArrayI8Trap:     return "NVPTXISD::Suld1DArrayI8Trap";
1370   case NVPTXISD::Suld1DArrayI16Trap:    return "NVPTXISD::Suld1DArrayI16Trap";
1371   case NVPTXISD::Suld1DArrayI32Trap:    return "NVPTXISD::Suld1DArrayI32Trap";
1372   case NVPTXISD::Suld1DArrayI64Trap:    return "NVPTXISD::Suld1DArrayI64Trap";
1373   case NVPTXISD::Suld1DArrayV2I8Trap:   return "NVPTXISD::Suld1DArrayV2I8Trap";
1374   case NVPTXISD::Suld1DArrayV2I16Trap:  return "NVPTXISD::Suld1DArrayV2I16Trap";
1375   case NVPTXISD::Suld1DArrayV2I32Trap:  return "NVPTXISD::Suld1DArrayV2I32Trap";
1376   case NVPTXISD::Suld1DArrayV2I64Trap:  return "NVPTXISD::Suld1DArrayV2I64Trap";
1377   case NVPTXISD::Suld1DArrayV4I8Trap:   return "NVPTXISD::Suld1DArrayV4I8Trap";
1378   case NVPTXISD::Suld1DArrayV4I16Trap:  return "NVPTXISD::Suld1DArrayV4I16Trap";
1379   case NVPTXISD::Suld1DArrayV4I32Trap:  return "NVPTXISD::Suld1DArrayV4I32Trap";
1380 
1381   case NVPTXISD::Suld2DI8Trap:          return "NVPTXISD::Suld2DI8Trap";
1382   case NVPTXISD::Suld2DI16Trap:         return "NVPTXISD::Suld2DI16Trap";
1383   case NVPTXISD::Suld2DI32Trap:         return "NVPTXISD::Suld2DI32Trap";
1384   case NVPTXISD::Suld2DI64Trap:         return "NVPTXISD::Suld2DI64Trap";
1385   case NVPTXISD::Suld2DV2I8Trap:        return "NVPTXISD::Suld2DV2I8Trap";
1386   case NVPTXISD::Suld2DV2I16Trap:       return "NVPTXISD::Suld2DV2I16Trap";
1387   case NVPTXISD::Suld2DV2I32Trap:       return "NVPTXISD::Suld2DV2I32Trap";
1388   case NVPTXISD::Suld2DV2I64Trap:       return "NVPTXISD::Suld2DV2I64Trap";
1389   case NVPTXISD::Suld2DV4I8Trap:        return "NVPTXISD::Suld2DV4I8Trap";
1390   case NVPTXISD::Suld2DV4I16Trap:       return "NVPTXISD::Suld2DV4I16Trap";
1391   case NVPTXISD::Suld2DV4I32Trap:       return "NVPTXISD::Suld2DV4I32Trap";
1392 
1393   case NVPTXISD::Suld2DArrayI8Trap:     return "NVPTXISD::Suld2DArrayI8Trap";
1394   case NVPTXISD::Suld2DArrayI16Trap:    return "NVPTXISD::Suld2DArrayI16Trap";
1395   case NVPTXISD::Suld2DArrayI32Trap:    return "NVPTXISD::Suld2DArrayI32Trap";
1396   case NVPTXISD::Suld2DArrayI64Trap:    return "NVPTXISD::Suld2DArrayI64Trap";
1397   case NVPTXISD::Suld2DArrayV2I8Trap:   return "NVPTXISD::Suld2DArrayV2I8Trap";
1398   case NVPTXISD::Suld2DArrayV2I16Trap:  return "NVPTXISD::Suld2DArrayV2I16Trap";
1399   case NVPTXISD::Suld2DArrayV2I32Trap:  return "NVPTXISD::Suld2DArrayV2I32Trap";
1400   case NVPTXISD::Suld2DArrayV2I64Trap:  return "NVPTXISD::Suld2DArrayV2I64Trap";
1401   case NVPTXISD::Suld2DArrayV4I8Trap:   return "NVPTXISD::Suld2DArrayV4I8Trap";
1402   case NVPTXISD::Suld2DArrayV4I16Trap:  return "NVPTXISD::Suld2DArrayV4I16Trap";
1403   case NVPTXISD::Suld2DArrayV4I32Trap:  return "NVPTXISD::Suld2DArrayV4I32Trap";
1404 
1405   case NVPTXISD::Suld3DI8Trap:          return "NVPTXISD::Suld3DI8Trap";
1406   case NVPTXISD::Suld3DI16Trap:         return "NVPTXISD::Suld3DI16Trap";
1407   case NVPTXISD::Suld3DI32Trap:         return "NVPTXISD::Suld3DI32Trap";
1408   case NVPTXISD::Suld3DI64Trap:         return "NVPTXISD::Suld3DI64Trap";
1409   case NVPTXISD::Suld3DV2I8Trap:        return "NVPTXISD::Suld3DV2I8Trap";
1410   case NVPTXISD::Suld3DV2I16Trap:       return "NVPTXISD::Suld3DV2I16Trap";
1411   case NVPTXISD::Suld3DV2I32Trap:       return "NVPTXISD::Suld3DV2I32Trap";
1412   case NVPTXISD::Suld3DV2I64Trap:       return "NVPTXISD::Suld3DV2I64Trap";
1413   case NVPTXISD::Suld3DV4I8Trap:        return "NVPTXISD::Suld3DV4I8Trap";
1414   case NVPTXISD::Suld3DV4I16Trap:       return "NVPTXISD::Suld3DV4I16Trap";
1415   case NVPTXISD::Suld3DV4I32Trap:       return "NVPTXISD::Suld3DV4I32Trap";
1416 
1417   case NVPTXISD::Suld1DI8Zero:          return "NVPTXISD::Suld1DI8Zero";
1418   case NVPTXISD::Suld1DI16Zero:         return "NVPTXISD::Suld1DI16Zero";
1419   case NVPTXISD::Suld1DI32Zero:         return "NVPTXISD::Suld1DI32Zero";
1420   case NVPTXISD::Suld1DI64Zero:         return "NVPTXISD::Suld1DI64Zero";
1421   case NVPTXISD::Suld1DV2I8Zero:        return "NVPTXISD::Suld1DV2I8Zero";
1422   case NVPTXISD::Suld1DV2I16Zero:       return "NVPTXISD::Suld1DV2I16Zero";
1423   case NVPTXISD::Suld1DV2I32Zero:       return "NVPTXISD::Suld1DV2I32Zero";
1424   case NVPTXISD::Suld1DV2I64Zero:       return "NVPTXISD::Suld1DV2I64Zero";
1425   case NVPTXISD::Suld1DV4I8Zero:        return "NVPTXISD::Suld1DV4I8Zero";
1426   case NVPTXISD::Suld1DV4I16Zero:       return "NVPTXISD::Suld1DV4I16Zero";
1427   case NVPTXISD::Suld1DV4I32Zero:       return "NVPTXISD::Suld1DV4I32Zero";
1428 
1429   case NVPTXISD::Suld1DArrayI8Zero:     return "NVPTXISD::Suld1DArrayI8Zero";
1430   case NVPTXISD::Suld1DArrayI16Zero:    return "NVPTXISD::Suld1DArrayI16Zero";
1431   case NVPTXISD::Suld1DArrayI32Zero:    return "NVPTXISD::Suld1DArrayI32Zero";
1432   case NVPTXISD::Suld1DArrayI64Zero:    return "NVPTXISD::Suld1DArrayI64Zero";
1433   case NVPTXISD::Suld1DArrayV2I8Zero:   return "NVPTXISD::Suld1DArrayV2I8Zero";
1434   case NVPTXISD::Suld1DArrayV2I16Zero:  return "NVPTXISD::Suld1DArrayV2I16Zero";
1435   case NVPTXISD::Suld1DArrayV2I32Zero:  return "NVPTXISD::Suld1DArrayV2I32Zero";
1436   case NVPTXISD::Suld1DArrayV2I64Zero:  return "NVPTXISD::Suld1DArrayV2I64Zero";
1437   case NVPTXISD::Suld1DArrayV4I8Zero:   return "NVPTXISD::Suld1DArrayV4I8Zero";
1438   case NVPTXISD::Suld1DArrayV4I16Zero:  return "NVPTXISD::Suld1DArrayV4I16Zero";
1439   case NVPTXISD::Suld1DArrayV4I32Zero:  return "NVPTXISD::Suld1DArrayV4I32Zero";
1440 
1441   case NVPTXISD::Suld2DI8Zero:          return "NVPTXISD::Suld2DI8Zero";
1442   case NVPTXISD::Suld2DI16Zero:         return "NVPTXISD::Suld2DI16Zero";
1443   case NVPTXISD::Suld2DI32Zero:         return "NVPTXISD::Suld2DI32Zero";
1444   case NVPTXISD::Suld2DI64Zero:         return "NVPTXISD::Suld2DI64Zero";
1445   case NVPTXISD::Suld2DV2I8Zero:        return "NVPTXISD::Suld2DV2I8Zero";
1446   case NVPTXISD::Suld2DV2I16Zero:       return "NVPTXISD::Suld2DV2I16Zero";
1447   case NVPTXISD::Suld2DV2I32Zero:       return "NVPTXISD::Suld2DV2I32Zero";
1448   case NVPTXISD::Suld2DV2I64Zero:       return "NVPTXISD::Suld2DV2I64Zero";
1449   case NVPTXISD::Suld2DV4I8Zero:        return "NVPTXISD::Suld2DV4I8Zero";
1450   case NVPTXISD::Suld2DV4I16Zero:       return "NVPTXISD::Suld2DV4I16Zero";
1451   case NVPTXISD::Suld2DV4I32Zero:       return "NVPTXISD::Suld2DV4I32Zero";
1452 
1453   case NVPTXISD::Suld2DArrayI8Zero:     return "NVPTXISD::Suld2DArrayI8Zero";
1454   case NVPTXISD::Suld2DArrayI16Zero:    return "NVPTXISD::Suld2DArrayI16Zero";
1455   case NVPTXISD::Suld2DArrayI32Zero:    return "NVPTXISD::Suld2DArrayI32Zero";
1456   case NVPTXISD::Suld2DArrayI64Zero:    return "NVPTXISD::Suld2DArrayI64Zero";
1457   case NVPTXISD::Suld2DArrayV2I8Zero:   return "NVPTXISD::Suld2DArrayV2I8Zero";
1458   case NVPTXISD::Suld2DArrayV2I16Zero:  return "NVPTXISD::Suld2DArrayV2I16Zero";
1459   case NVPTXISD::Suld2DArrayV2I32Zero:  return "NVPTXISD::Suld2DArrayV2I32Zero";
1460   case NVPTXISD::Suld2DArrayV2I64Zero:  return "NVPTXISD::Suld2DArrayV2I64Zero";
1461   case NVPTXISD::Suld2DArrayV4I8Zero:   return "NVPTXISD::Suld2DArrayV4I8Zero";
1462   case NVPTXISD::Suld2DArrayV4I16Zero:  return "NVPTXISD::Suld2DArrayV4I16Zero";
1463   case NVPTXISD::Suld2DArrayV4I32Zero:  return "NVPTXISD::Suld2DArrayV4I32Zero";
1464 
1465   case NVPTXISD::Suld3DI8Zero:          return "NVPTXISD::Suld3DI8Zero";
1466   case NVPTXISD::Suld3DI16Zero:         return "NVPTXISD::Suld3DI16Zero";
1467   case NVPTXISD::Suld3DI32Zero:         return "NVPTXISD::Suld3DI32Zero";
1468   case NVPTXISD::Suld3DI64Zero:         return "NVPTXISD::Suld3DI64Zero";
1469   case NVPTXISD::Suld3DV2I8Zero:        return "NVPTXISD::Suld3DV2I8Zero";
1470   case NVPTXISD::Suld3DV2I16Zero:       return "NVPTXISD::Suld3DV2I16Zero";
1471   case NVPTXISD::Suld3DV2I32Zero:       return "NVPTXISD::Suld3DV2I32Zero";
1472   case NVPTXISD::Suld3DV2I64Zero:       return "NVPTXISD::Suld3DV2I64Zero";
1473   case NVPTXISD::Suld3DV4I8Zero:        return "NVPTXISD::Suld3DV4I8Zero";
1474   case NVPTXISD::Suld3DV4I16Zero:       return "NVPTXISD::Suld3DV4I16Zero";
1475   case NVPTXISD::Suld3DV4I32Zero:       return "NVPTXISD::Suld3DV4I32Zero";
1476   }
1477   return nullptr;
1478 }
1479 
1480 TargetLoweringBase::LegalizeTypeAction
1481 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1482   if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1483       VT.getScalarType() == MVT::i1)
1484     return TypeSplitVector;
1485   if (Isv2x16VT(VT))
1486     return TypeLegal;
1487   return TargetLoweringBase::getPreferredVectorAction(VT);
1488 }
1489 
1490 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1491                                              int Enabled, int &ExtraSteps,
1492                                              bool &UseOneConst,
1493                                              bool Reciprocal) const {
1494   if (!(Enabled == ReciprocalEstimate::Enabled ||
1495         (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
1496     return SDValue();
1497 
1498   if (ExtraSteps == ReciprocalEstimate::Unspecified)
1499     ExtraSteps = 0;
1500 
1501   SDLoc DL(Operand);
1502   EVT VT = Operand.getValueType();
1503   bool Ftz = useF32FTZ(DAG.getMachineFunction());
1504 
1505   auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1506     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1507                        DAG.getConstant(IID, DL, MVT::i32), Operand);
1508   };
1509 
1510   // The sqrt and rsqrt refinement processes assume we always start out with an
1511   // approximation of the rsqrt.  Therefore, if we're going to do any refinement
1512   // (i.e. ExtraSteps > 0), we must return an rsqrt.  But if we're *not* doing
1513   // any refinement, we must return a regular sqrt.
1514   if (Reciprocal || ExtraSteps > 0) {
1515     if (VT == MVT::f32)
1516       return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1517                                    : Intrinsic::nvvm_rsqrt_approx_f);
1518     else if (VT == MVT::f64)
1519       return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1520     else
1521       return SDValue();
1522   } else {
1523     if (VT == MVT::f32)
1524       return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1525                                    : Intrinsic::nvvm_sqrt_approx_f);
1526     else {
1527       // There's no sqrt.approx.f64 instruction, so we emit
1528       // reciprocal(rsqrt(x)).  This is faster than
1529       // select(x == 0, 0, x * rsqrt(x)).  (In fact, it's faster than plain
1530       // x * rsqrt(x).)
1531       return DAG.getNode(
1532           ISD::INTRINSIC_WO_CHAIN, DL, VT,
1533           DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1534           MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1535     }
1536   }
1537 }
1538 
1539 SDValue
1540 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
1541   SDLoc dl(Op);
1542   const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1543   auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1544   Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1545   return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1546 }
1547 
1548 static bool IsTypePassedAsArray(const Type *Ty) {
1549   return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) ||
1550          Ty->isHalfTy() || Ty->isBFloatTy();
1551 }
1552 
1553 std::string NVPTXTargetLowering::getPrototype(
1554     const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1555     const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
1556     std::optional<std::pair<unsigned, const APInt &>> VAInfo,
1557     const CallBase &CB, unsigned UniqueCallSite) const {
1558   auto PtrVT = getPointerTy(DL);
1559 
1560   bool isABI = (STI.getSmVersion() >= 20);
1561   assert(isABI && "Non-ABI compilation is not supported");
1562   if (!isABI)
1563     return "";
1564 
1565   std::string Prototype;
1566   raw_string_ostream O(Prototype);
1567   O << "prototype_" << UniqueCallSite << " : .callprototype ";
1568 
1569   if (retTy->getTypeID() == Type::VoidTyID) {
1570     O << "()";
1571   } else {
1572     O << "(";
1573     if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) &&
1574         !IsTypePassedAsArray(retTy)) {
1575       unsigned size = 0;
1576       if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1577         size = ITy->getBitWidth();
1578       } else {
1579         assert(retTy->isFloatingPointTy() &&
1580                "Floating point type expected here");
1581         size = retTy->getPrimitiveSizeInBits();
1582       }
1583       // PTX ABI requires all scalar return values to be at least 32
1584       // bits in size.  fp16 normally uses .b16 as its storage type in
1585       // PTX, so its size must be adjusted here, too.
1586       size = promoteScalarArgumentSize(size);
1587 
1588       O << ".param .b" << size << " _";
1589     } else if (isa<PointerType>(retTy)) {
1590       O << ".param .b" << PtrVT.getSizeInBits() << " _";
1591     } else if (IsTypePassedAsArray(retTy)) {
1592       O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
1593         << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
1594     } else {
1595       llvm_unreachable("Unknown return type");
1596     }
1597     O << ") ";
1598   }
1599   O << "_ (";
1600 
1601   bool first = true;
1602 
1603   const Function *F = CB.getFunction();
1604   unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
1605   for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
1606     Type *Ty = Args[i].Ty;
1607     if (!first) {
1608       O << ", ";
1609     }
1610     first = false;
1611 
1612     if (!Outs[OIdx].Flags.isByVal()) {
1613       if (IsTypePassedAsArray(Ty)) {
1614         unsigned ParamAlign = 0;
1615         const CallInst *CallI = cast<CallInst>(&CB);
1616         // +1 because index 0 is reserved for return type alignment
1617         if (!getAlign(*CallI, i + 1, ParamAlign))
1618           ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value();
1619         O << ".param .align " << ParamAlign << " .b8 ";
1620         O << "_";
1621         O << "[" << DL.getTypeAllocSize(Ty) << "]";
1622         // update the index for Outs
1623         SmallVector<EVT, 16> vtparts;
1624         ComputeValueVTs(*this, DL, Ty, vtparts);
1625         if (unsigned len = vtparts.size())
1626           OIdx += len - 1;
1627         continue;
1628       }
1629       // i8 types in IR will be i16 types in SDAG
1630       assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1631               (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1632              "type mismatch between callee prototype and arguments");
1633       // scalar type
1634       unsigned sz = 0;
1635       if (isa<IntegerType>(Ty)) {
1636         sz = cast<IntegerType>(Ty)->getBitWidth();
1637         sz = promoteScalarArgumentSize(sz);
1638       } else if (isa<PointerType>(Ty)) {
1639         sz = PtrVT.getSizeInBits();
1640       } else {
1641         sz = Ty->getPrimitiveSizeInBits();
1642       }
1643       O << ".param .b" << sz << " ";
1644       O << "_";
1645       continue;
1646     }
1647 
1648     Type *ETy = Args[i].IndirectType;
1649     Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1650     Align ParamByValAlign =
1651         getFunctionByValParamAlign(F, ETy, InitialAlign, DL);
1652 
1653     O << ".param .align " << ParamByValAlign.value() << " .b8 ";
1654     O << "_";
1655     O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
1656   }
1657 
1658   if (VAInfo)
1659     O << (first ? "" : ",") << " .param .align " << VAInfo->second
1660       << " .b8 _[]\n";
1661   O << ")";
1662   if (shouldEmitPTXNoReturn(&CB, *nvTM))
1663     O << " .noreturn";
1664   O << ";";
1665 
1666   return Prototype;
1667 }
1668 
1669 Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
1670                                                 unsigned Idx,
1671                                                 const DataLayout &DL) const {
1672   if (!CB) {
1673     // CallSite is zero, fallback to ABI type alignment
1674     return DL.getABITypeAlign(Ty);
1675   }
1676 
1677   unsigned Alignment = 0;
1678   const Function *DirectCallee = CB->getCalledFunction();
1679 
1680   if (!DirectCallee) {
1681     // We don't have a direct function symbol, but that may be because of
1682     // constant cast instructions in the call.
1683 
1684     // With bitcast'd call targets, the instruction will be the call
1685     if (const auto *CI = dyn_cast<CallInst>(CB)) {
1686       // Check if we have call alignment metadata
1687       if (getAlign(*CI, Idx, Alignment))
1688         return Align(Alignment);
1689     }
1690     DirectCallee = getMaybeBitcastedCallee(CB);
1691   }
1692 
1693   // Check for function alignment information if we found that the
1694   // ultimate target is a Function
1695   if (DirectCallee) {
1696     if (getAlign(*DirectCallee, Idx, Alignment))
1697       return Align(Alignment);
1698     // If alignment information is not available, fall back to the
1699     // default function param optimized type alignment
1700     return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL);
1701   }
1702 
1703   // Call is indirect, fall back to the ABI type alignment
1704   return DL.getABITypeAlign(Ty);
1705 }
1706 
1707 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1708                                        SmallVectorImpl<SDValue> &InVals) const {
1709 
1710   if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1711     report_fatal_error(
1712         "Support for variadic functions (unsized array parameter) introduced "
1713         "in PTX ISA version 6.0 and requires target sm_30.");
1714 
1715   SelectionDAG &DAG = CLI.DAG;
1716   SDLoc dl = CLI.DL;
1717   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1718   SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1719   SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1720   SDValue Chain = CLI.Chain;
1721   SDValue Callee = CLI.Callee;
1722   bool &isTailCall = CLI.IsTailCall;
1723   ArgListTy &Args = CLI.getArgs();
1724   Type *RetTy = CLI.RetTy;
1725   const CallBase *CB = CLI.CB;
1726   const DataLayout &DL = DAG.getDataLayout();
1727 
1728   bool isABI = (STI.getSmVersion() >= 20);
1729   assert(isABI && "Non-ABI compilation is not supported");
1730   if (!isABI)
1731     return Chain;
1732 
1733   // Variadic arguments.
1734   //
1735   // Normally, for each argument, we declare a param scalar or a param
1736   // byte array in the .param space, and store the argument value to that
1737   // param scalar or array starting at offset 0.
1738   //
1739   // In the case of the first variadic argument, we declare a vararg byte array
1740   // with size 0. The exact size of this array isn't known at this point, so
1741   // it'll be patched later. All the variadic arguments will be stored to this
1742   // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1743   // initially set to 0, so it can be used for non-variadic arguments (which use
1744   // 0 offset) to simplify the code.
1745   //
1746   // After all vararg is processed, 'VAOffset' holds the size of the
1747   // vararg byte array.
1748 
1749   SDValue VADeclareParam;                 // vararg byte array
1750   unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic
1751   unsigned VAOffset = 0;                  // current offset in the param array
1752 
1753   unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
1754   SDValue TempChain = Chain;
1755   Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
1756   SDValue InGlue = Chain.getValue(1);
1757 
1758   unsigned ParamCount = 0;
1759   // Args.size() and Outs.size() need not match.
1760   // Outs.size() will be larger
1761   //   * if there is an aggregate argument with multiple fields (each field
1762   //     showing up separately in Outs)
1763   //   * if there is a vector argument with more than typical vector-length
1764   //     elements (generally if more than 4) where each vector element is
1765   //     individually present in Outs.
1766   // So a different index should be used for indexing into Outs/OutVals.
1767   // See similar issue in LowerFormalArguments.
1768   unsigned OIdx = 0;
1769   // Declare the .params or .reg need to pass values
1770   // to the function
1771   for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1772     EVT VT = Outs[OIdx].VT;
1773     Type *Ty = Args[i].Ty;
1774     bool IsVAArg = (i >= CLI.NumFixedArgs);
1775     bool IsByVal = Outs[OIdx].Flags.isByVal();
1776 
1777     SmallVector<EVT, 16> VTs;
1778     SmallVector<uint64_t, 16> Offsets;
1779 
1780     assert((!IsByVal || Args[i].IndirectType) &&
1781            "byval arg must have indirect type");
1782     Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
1783     ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset);
1784 
1785     Align ArgAlign;
1786     if (IsByVal) {
1787       // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1788       // so we don't need to worry whether it's naturally aligned or not.
1789       // See TargetLowering::LowerCallTo().
1790       Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1791       ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
1792                                             InitialAlign, DL);
1793       if (IsVAArg)
1794         VAOffset = alignTo(VAOffset, ArgAlign);
1795     } else {
1796       ArgAlign = getArgumentAlignment(CB, Ty, ParamCount + 1, DL);
1797     }
1798 
1799     unsigned TypeSize =
1800         (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty));
1801     SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1802 
1803     bool NeedAlign; // Does argument declaration specify alignment?
1804     bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty);
1805     if (IsVAArg) {
1806       if (ParamCount == FirstVAArg) {
1807         SDValue DeclareParamOps[] = {
1808             Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32),
1809             DAG.getConstant(ParamCount, dl, MVT::i32),
1810             DAG.getConstant(1, dl, MVT::i32), InGlue};
1811         VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl,
1812                                              DeclareParamVTs, DeclareParamOps);
1813       }
1814       NeedAlign = PassAsArray;
1815     } else if (PassAsArray) {
1816       // declare .param .align <align> .b8 .param<n>[<size>];
1817       SDValue DeclareParamOps[] = {
1818           Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
1819           DAG.getConstant(ParamCount, dl, MVT::i32),
1820           DAG.getConstant(TypeSize, dl, MVT::i32), InGlue};
1821       Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1822                           DeclareParamOps);
1823       NeedAlign = true;
1824     } else {
1825       // declare .param .b<size> .param<n>;
1826       if (VT.isInteger() || VT.isFloatingPoint()) {
1827         // PTX ABI requires integral types to be at least 32 bits in
1828         // size. FP16 is loaded/stored using i16, so it's handled
1829         // here as well.
1830         TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8;
1831       }
1832       SDValue DeclareScalarParamOps[] = {
1833           Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
1834           DAG.getConstant(TypeSize * 8, dl, MVT::i32),
1835           DAG.getConstant(0, dl, MVT::i32), InGlue};
1836       Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1837                           DeclareScalarParamOps);
1838       NeedAlign = false;
1839     }
1840     InGlue = Chain.getValue(1);
1841 
1842     // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1843     // than 32-bits are sign extended or zero extended, depending on
1844     // whether they are signed or unsigned types. This case applies
1845     // only to scalar parameters and not to aggregate values.
1846     bool ExtendIntegerParam =
1847         Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1848 
1849     auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1850     SmallVector<SDValue, 6> StoreOperands;
1851     for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1852       EVT EltVT = VTs[j];
1853       int CurOffset = Offsets[j];
1854       MaybeAlign PartAlign;
1855       if (NeedAlign)
1856         PartAlign = commonAlignment(ArgAlign, CurOffset);
1857 
1858       // New store.
1859       if (VectorInfo[j] & PVF_FIRST) {
1860         assert(StoreOperands.empty() && "Unfinished preceding store.");
1861         StoreOperands.push_back(Chain);
1862         StoreOperands.push_back(
1863             DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
1864         StoreOperands.push_back(DAG.getConstant(
1865             IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
1866             dl, MVT::i32));
1867       }
1868 
1869       SDValue StVal = OutVals[OIdx];
1870 
1871       MVT PromotedVT;
1872       if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
1873         EltVT = EVT(PromotedVT);
1874       }
1875       if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {
1876         llvm::ISD::NodeType Ext =
1877             Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1878         StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
1879       }
1880 
1881       if (IsByVal) {
1882         auto PtrVT = getPointerTy(DL);
1883         SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
1884                                       DAG.getConstant(CurOffset, dl, PtrVT));
1885         StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(),
1886                             PartAlign);
1887       } else if (ExtendIntegerParam) {
1888         assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1889         // zext/sext to i32
1890         StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1891                                                       : ISD::ZERO_EXTEND,
1892                             dl, MVT::i32, StVal);
1893       }
1894 
1895       if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {
1896         // Use 16-bit registers for small stores as it's the
1897         // smallest general purpose register size supported by NVPTX.
1898         StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1899       }
1900 
1901       // Record the value to store.
1902       StoreOperands.push_back(StVal);
1903 
1904       if (VectorInfo[j] & PVF_LAST) {
1905         unsigned NumElts = StoreOperands.size() - 3;
1906         NVPTXISD::NodeType Op;
1907         switch (NumElts) {
1908         case 1:
1909           Op = NVPTXISD::StoreParam;
1910           break;
1911         case 2:
1912           Op = NVPTXISD::StoreParamV2;
1913           break;
1914         case 4:
1915           Op = NVPTXISD::StoreParamV4;
1916           break;
1917         default:
1918           llvm_unreachable("Invalid vector info.");
1919         }
1920 
1921         StoreOperands.push_back(InGlue);
1922 
1923         // Adjust type of the store op if we've extended the scalar
1924         // return value.
1925         EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1926 
1927         Chain = DAG.getMemIntrinsicNode(
1928             Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1929             TheStoreType, MachinePointerInfo(), PartAlign,
1930             MachineMemOperand::MOStore);
1931         InGlue = Chain.getValue(1);
1932 
1933         // Cleanup.
1934         StoreOperands.clear();
1935 
1936         // TODO: We may need to support vector types that can be passed
1937         // as scalars in variadic arguments.
1938         if (!IsByVal && IsVAArg) {
1939           assert(NumElts == 1 &&
1940                  "Vectorization is expected to be disabled for variadics.");
1941           VAOffset += DL.getTypeAllocSize(
1942               TheStoreType.getTypeForEVT(*DAG.getContext()));
1943         }
1944       }
1945       if (!IsByVal)
1946         ++OIdx;
1947     }
1948     assert(StoreOperands.empty() && "Unfinished parameter store.");
1949     if (!IsByVal && VTs.size() > 0)
1950       --OIdx;
1951     ++ParamCount;
1952     if (IsByVal && IsVAArg)
1953       VAOffset += TypeSize;
1954   }
1955 
1956   GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1957   MaybeAlign retAlignment = std::nullopt;
1958 
1959   // Handle Result
1960   if (Ins.size() > 0) {
1961     SmallVector<EVT, 16> resvtparts;
1962     ComputeValueVTs(*this, DL, RetTy, resvtparts);
1963 
1964     // Declare
1965     //  .param .align N .b8 retval0[<size-in-bytes>], or
1966     //  .param .b<size-in-bits> retval0
1967     unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1968     if (!IsTypePassedAsArray(RetTy)) {
1969       resultsz = promoteScalarArgumentSize(resultsz);
1970       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1971       SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1972                                   DAG.getConstant(resultsz, dl, MVT::i32),
1973                                   DAG.getConstant(0, dl, MVT::i32), InGlue };
1974       Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1975                           DeclareRetOps);
1976       InGlue = Chain.getValue(1);
1977     } else {
1978       retAlignment = getArgumentAlignment(CB, RetTy, 0, DL);
1979       assert(retAlignment && "retAlignment is guaranteed to be set");
1980       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1981       SDValue DeclareRetOps[] = {
1982           Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
1983           DAG.getConstant(resultsz / 8, dl, MVT::i32),
1984           DAG.getConstant(0, dl, MVT::i32), InGlue};
1985       Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1986                           DeclareRetOps);
1987       InGlue = Chain.getValue(1);
1988     }
1989   }
1990 
1991   bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1992   // Set the size of the vararg param byte array if the callee is a variadic
1993   // function and the variadic part is not empty.
1994   if (HasVAArgs) {
1995     SDValue DeclareParamOps[] = {
1996         VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),
1997         VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),
1998         VADeclareParam.getOperand(4)};
1999     DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
2000                     VADeclareParam->getVTList(), DeclareParamOps);
2001   }
2002 
2003   // Both indirect calls and libcalls have nullptr Func. In order to distinguish
2004   // between them we must rely on the call site value which is valid for
2005   // indirect calls but is always null for libcalls.
2006   bool isIndirectCall = !Func && CB;
2007 
2008   if (isa<ExternalSymbolSDNode>(Callee)) {
2009     Function* CalleeFunc = nullptr;
2010 
2011     // Try to find the callee in the current module.
2012     Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
2013     assert(CalleeFunc != nullptr && "Libcall callee must be set.");
2014 
2015     // Set the "libcall callee" attribute to indicate that the function
2016     // must always have a declaration.
2017     CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
2018   }
2019 
2020   if (isIndirectCall) {
2021     // This is indirect function call case : PTX requires a prototype of the
2022     // form
2023     // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
2024     // to be emitted, and the label has to used as the last arg of call
2025     // instruction.
2026     // The prototype is embedded in a string and put as the operand for a
2027     // CallPrototype SDNode which will print out to the value of the string.
2028     SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2029     std::string Proto = getPrototype(
2030         DL, RetTy, Args, Outs, retAlignment,
2031         HasVAArgs
2032             ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
2033                   CLI.NumFixedArgs, VADeclareParam->getConstantOperandAPInt(1)))
2034             : std::nullopt,
2035         *CB, UniqueCallSite);
2036     const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
2037     SDValue ProtoOps[] = {
2038         Chain,
2039         DAG.getTargetExternalSymbol(ProtoStr, MVT::i32),
2040         InGlue,
2041     };
2042     Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
2043     InGlue = Chain.getValue(1);
2044   }
2045   // Op to just print "call"
2046   SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2047   SDValue PrintCallOps[] = {
2048     Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
2049   };
2050   // We model convergent calls as separate opcodes.
2051   unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
2052   if (CLI.IsConvergent)
2053     Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
2054                                               : NVPTXISD::PrintConvergentCall;
2055   Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
2056   InGlue = Chain.getValue(1);
2057 
2058   // Ops to print out the function name
2059   SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2060   SDValue CallVoidOps[] = { Chain, Callee, InGlue };
2061   Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
2062   InGlue = Chain.getValue(1);
2063 
2064   // Ops to print out the param list
2065   SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2066   SDValue CallArgBeginOps[] = { Chain, InGlue };
2067   Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
2068                       CallArgBeginOps);
2069   InGlue = Chain.getValue(1);
2070 
2071   for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e;
2072        ++i) {
2073     unsigned opcode;
2074     if (i == (e - 1))
2075       opcode = NVPTXISD::LastCallArg;
2076     else
2077       opcode = NVPTXISD::CallArg;
2078     SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2079     SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
2080                              DAG.getConstant(i, dl, MVT::i32), InGlue };
2081     Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
2082     InGlue = Chain.getValue(1);
2083   }
2084   SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2085   SDValue CallArgEndOps[] = { Chain,
2086                               DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
2087                               InGlue };
2088   Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
2089   InGlue = Chain.getValue(1);
2090 
2091   if (isIndirectCall) {
2092     SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2093     SDValue PrototypeOps[] = {
2094         Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
2095     Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
2096     InGlue = Chain.getValue(1);
2097   }
2098 
2099   SmallVector<SDValue, 16> ProxyRegOps;
2100   SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
2101 
2102   // Generate loads from param memory/moves from registers for result
2103   if (Ins.size() > 0) {
2104     SmallVector<EVT, 16> VTs;
2105     SmallVector<uint64_t, 16> Offsets;
2106     ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
2107     assert(VTs.size() == Ins.size() && "Bad value decomposition");
2108 
2109     Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
2110     auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
2111 
2112     SmallVector<EVT, 6> LoadVTs;
2113     int VecIdx = -1; // Index of the first element of the vector.
2114 
2115     // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2116     // 32-bits are sign extended or zero extended, depending on whether
2117     // they are signed or unsigned types.
2118     bool ExtendIntegerRetVal =
2119         RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2120 
2121     for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2122       bool needTruncate = false;
2123       EVT TheLoadType = VTs[i];
2124       EVT EltType = Ins[i].VT;
2125       Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
2126       MVT PromotedVT;
2127 
2128       if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
2129         TheLoadType = EVT(PromotedVT);
2130         EltType = EVT(PromotedVT);
2131         needTruncate = true;
2132       }
2133 
2134       if (ExtendIntegerRetVal) {
2135         TheLoadType = MVT::i32;
2136         EltType = MVT::i32;
2137         needTruncate = true;
2138       } else if (TheLoadType.getSizeInBits() < 16) {
2139         if (VTs[i].isInteger())
2140           needTruncate = true;
2141         EltType = MVT::i16;
2142       }
2143 
2144       // Record index of the very first element of the vector.
2145       if (VectorInfo[i] & PVF_FIRST) {
2146         assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
2147         VecIdx = i;
2148       }
2149 
2150       LoadVTs.push_back(EltType);
2151 
2152       if (VectorInfo[i] & PVF_LAST) {
2153         unsigned NumElts = LoadVTs.size();
2154         LoadVTs.push_back(MVT::Other);
2155         LoadVTs.push_back(MVT::Glue);
2156         NVPTXISD::NodeType Op;
2157         switch (NumElts) {
2158         case 1:
2159           Op = NVPTXISD::LoadParam;
2160           break;
2161         case 2:
2162           Op = NVPTXISD::LoadParamV2;
2163           break;
2164         case 4:
2165           Op = NVPTXISD::LoadParamV4;
2166           break;
2167         default:
2168           llvm_unreachable("Invalid vector info.");
2169         }
2170 
2171         SDValue LoadOperands[] = {
2172             Chain, DAG.getConstant(1, dl, MVT::i32),
2173             DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue};
2174         SDValue RetVal = DAG.getMemIntrinsicNode(
2175             Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
2176             MachinePointerInfo(), EltAlign,
2177             MachineMemOperand::MOLoad);
2178 
2179         for (unsigned j = 0; j < NumElts; ++j) {
2180           ProxyRegOps.push_back(RetVal.getValue(j));
2181 
2182           if (needTruncate)
2183             ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT));
2184           else
2185             ProxyRegTruncates.push_back(std::optional<MVT>());
2186         }
2187 
2188         Chain = RetVal.getValue(NumElts);
2189         InGlue = RetVal.getValue(NumElts + 1);
2190 
2191         // Cleanup
2192         VecIdx = -1;
2193         LoadVTs.clear();
2194       }
2195     }
2196   }
2197 
2198   Chain =
2199       DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
2200   InGlue = Chain.getValue(1);
2201 
2202   // Append ProxyReg instructions to the chain to make sure that `callseq_end`
2203   // will not get lost. Otherwise, during libcalls expansion, the nodes can become
2204   // dangling.
2205   for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
2206     SDValue Ret = DAG.getNode(
2207       NVPTXISD::ProxyReg, dl,
2208       DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
2209       { Chain, ProxyRegOps[i], InGlue }
2210     );
2211 
2212     Chain = Ret.getValue(1);
2213     InGlue = Ret.getValue(2);
2214 
2215     if (ProxyRegTruncates[i]) {
2216       Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret);
2217     }
2218 
2219     InVals.push_back(Ret);
2220   }
2221 
2222   // set isTailCall to false for now, until we figure out how to express
2223   // tail call optimization in PTX
2224   isTailCall = false;
2225   return Chain;
2226 }
2227 
2228 SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
2229                                                      SelectionDAG &DAG) const {
2230   const Function &Fn = DAG.getMachineFunction().getFunction();
2231 
2232   DiagnosticInfoUnsupported NoDynamicAlloca(
2233       Fn, "dynamic alloca unsupported by NVPTX backend",
2234       SDLoc(Op).getDebugLoc());
2235   DAG.getContext()->diagnose(NoDynamicAlloca);
2236   auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
2237   return DAG.getMergeValues(Ops, SDLoc());
2238 }
2239 
2240 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
2241 // (see LegalizeDAG.cpp). This is slow and uses local memory.
2242 // We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5
2243 SDValue
2244 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
2245   SDNode *Node = Op.getNode();
2246   SDLoc dl(Node);
2247   SmallVector<SDValue, 8> Ops;
2248   unsigned NumOperands = Node->getNumOperands();
2249   for (unsigned i = 0; i < NumOperands; ++i) {
2250     SDValue SubOp = Node->getOperand(i);
2251     EVT VVT = SubOp.getNode()->getValueType(0);
2252     EVT EltVT = VVT.getVectorElementType();
2253     unsigned NumSubElem = VVT.getVectorNumElements();
2254     for (unsigned j = 0; j < NumSubElem; ++j) {
2255       Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
2256                                 DAG.getIntPtrConstant(j, dl)));
2257     }
2258   }
2259   return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
2260 }
2261 
2262 // We can init constant f16x2/v2i16/v4i8 with a single .b32 move.  Normally it
2263 // would get lowered as two constant loads and vector-packing move.
2264 // Instead we want just a constant move:
2265 //        mov.b32         %r2, 0x40003C00
2266 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2267                                                SelectionDAG &DAG) const {
2268   EVT VT = Op->getValueType(0);
2269   if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
2270     return Op;
2271 
2272   SDLoc DL(Op);
2273 
2274   if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2275         return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2276                isa<ConstantFPSDNode>(Operand);
2277       })) {
2278     // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
2279     // to optimize calculation of constant parts.
2280     if (VT == MVT::v4i8) {
2281       SDValue C8 = DAG.getConstant(8, DL, MVT::i32);
2282       SDValue E01 = DAG.getNode(
2283           NVPTXISD::BFI, DL, MVT::i32,
2284           DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
2285           DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
2286       SDValue E012 =
2287           DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2288                       DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
2289                       E01, DAG.getConstant(16, DL, MVT::i32), C8);
2290       SDValue E0123 =
2291           DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2292                       DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
2293                       E012, DAG.getConstant(24, DL, MVT::i32), C8);
2294       return DAG.getNode(ISD::BITCAST, DL, VT, E0123);
2295     }
2296     return Op;
2297   }
2298 
2299   // Get value or the Nth operand as an APInt(32). Undef values treated as 0.
2300   auto GetOperand = [](SDValue Op, int N) -> APInt {
2301     const SDValue &Operand = Op->getOperand(N);
2302     EVT VT = Op->getValueType(0);
2303     if (Operand->isUndef())
2304       return APInt(32, 0);
2305     APInt Value;
2306     if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2307       Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2308     else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2309       Value = Operand->getAsAPIntVal();
2310     else
2311       llvm_unreachable("Unsupported type");
2312     // i8 values are carried around as i16, so we need to zero out upper bits,
2313     // so they do not get in the way of combining individual byte values
2314     if (VT == MVT::v4i8)
2315       Value = Value.trunc(8);
2316     return Value.zext(32);
2317   };
2318   APInt Value;
2319   if (Isv2x16VT(VT)) {
2320     Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16);
2321   } else if (VT == MVT::v4i8) {
2322     Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) |
2323             GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24);
2324   } else {
2325     llvm_unreachable("Unsupported type");
2326   }
2327   SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32);
2328   return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const);
2329 }
2330 
2331 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2332                                                      SelectionDAG &DAG) const {
2333   SDValue Index = Op->getOperand(1);
2334   SDValue Vector = Op->getOperand(0);
2335   SDLoc DL(Op);
2336   EVT VectorVT = Vector.getValueType();
2337 
2338   if (VectorVT == MVT::v4i8) {
2339     SDValue BFE =
2340         DAG.getNode(NVPTXISD::BFE, DL, MVT::i32,
2341                     {Vector,
2342                      DAG.getNode(ISD::MUL, DL, MVT::i32,
2343                                  DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2344                                  DAG.getConstant(8, DL, MVT::i32)),
2345                      DAG.getConstant(8, DL, MVT::i32)});
2346     return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0));
2347   }
2348 
2349   // Constant index will be matched by tablegen.
2350   if (isa<ConstantSDNode>(Index.getNode()))
2351     return Op;
2352 
2353   // Extract individual elements and select one of them.
2354   assert(Isv2x16VT(VectorVT) && "Unexpected vector type.");
2355   EVT EltVT = VectorVT.getVectorElementType();
2356 
2357   SDLoc dl(Op.getNode());
2358   SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2359                            DAG.getIntPtrConstant(0, dl));
2360   SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2361                            DAG.getIntPtrConstant(1, dl));
2362   return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2363                          ISD::CondCode::SETEQ);
2364 }
2365 
2366 SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2367                                                     SelectionDAG &DAG) const {
2368   SDValue Vector = Op->getOperand(0);
2369   EVT VectorVT = Vector.getValueType();
2370 
2371   if (VectorVT != MVT::v4i8)
2372     return Op;
2373   SDLoc DL(Op);
2374   SDValue Value = Op->getOperand(1);
2375   if (Value->isUndef())
2376     return Vector;
2377 
2378   SDValue Index = Op->getOperand(2);
2379 
2380   SDValue BFI =
2381       DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2382                   {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2383                    DAG.getNode(ISD::MUL, DL, MVT::i32,
2384                                DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2385                                DAG.getConstant(8, DL, MVT::i32)),
2386                    DAG.getConstant(8, DL, MVT::i32)});
2387   return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2388 }
2389 
2390 SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2391                                                  SelectionDAG &DAG) const {
2392   SDValue V1 = Op.getOperand(0);
2393   EVT VectorVT = V1.getValueType();
2394   if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2395     return Op;
2396 
2397   // Lower shuffle to PRMT instruction.
2398   const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2399   SDValue V2 = Op.getOperand(1);
2400   uint32_t Selector = 0;
2401   for (auto I : llvm::enumerate(SVN->getMask())) {
2402     if (I.value() != -1) // -1 is a placeholder for undef.
2403       Selector |= (I.value() << (I.index() * 4));
2404   }
2405 
2406   SDLoc DL(Op);
2407   return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2,
2408                      DAG.getConstant(Selector, DL, MVT::i32),
2409                      DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32));
2410 }
2411 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
2412 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2413 ///    amount, or
2414 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2415 ///    amount.
2416 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2417                                                   SelectionDAG &DAG) const {
2418   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2419   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2420 
2421   EVT VT = Op.getValueType();
2422   unsigned VTBits = VT.getSizeInBits();
2423   SDLoc dl(Op);
2424   SDValue ShOpLo = Op.getOperand(0);
2425   SDValue ShOpHi = Op.getOperand(1);
2426   SDValue ShAmt  = Op.getOperand(2);
2427   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2428 
2429   if (VTBits == 32 && STI.getSmVersion() >= 35) {
2430     // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2431     // {dHi, dLo} = {aHi, aLo} >> Amt
2432     //   dHi = aHi >> Amt
2433     //   dLo = shf.r.clamp aLo, aHi, Amt
2434 
2435     SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2436     SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
2437                              ShAmt);
2438 
2439     SDValue Ops[2] = { Lo, Hi };
2440     return DAG.getMergeValues(Ops, dl);
2441   }
2442   else {
2443     // {dHi, dLo} = {aHi, aLo} >> Amt
2444     // - if (Amt>=size) then
2445     //      dLo = aHi >> (Amt-size)
2446     //      dHi = aHi >> Amt (this is either all 0 or all 1)
2447     //   else
2448     //      dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2449     //      dHi = aHi >> Amt
2450 
2451     SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2452                                    DAG.getConstant(VTBits, dl, MVT::i32),
2453                                    ShAmt);
2454     SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2455     SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2456                                      DAG.getConstant(VTBits, dl, MVT::i32));
2457     SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2458     SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2459     SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2460 
2461     SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2462                                DAG.getConstant(VTBits, dl, MVT::i32),
2463                                ISD::SETGE);
2464     SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2465     SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2466 
2467     SDValue Ops[2] = { Lo, Hi };
2468     return DAG.getMergeValues(Ops, dl);
2469   }
2470 }
2471 
2472 /// LowerShiftLeftParts - Lower SHL_PARTS, which
2473 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2474 ///    amount, or
2475 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2476 ///    amount.
2477 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2478                                                  SelectionDAG &DAG) const {
2479   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2480   assert(Op.getOpcode() == ISD::SHL_PARTS);
2481 
2482   EVT VT = Op.getValueType();
2483   unsigned VTBits = VT.getSizeInBits();
2484   SDLoc dl(Op);
2485   SDValue ShOpLo = Op.getOperand(0);
2486   SDValue ShOpHi = Op.getOperand(1);
2487   SDValue ShAmt  = Op.getOperand(2);
2488 
2489   if (VTBits == 32 && STI.getSmVersion() >= 35) {
2490     // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2491     // {dHi, dLo} = {aHi, aLo} << Amt
2492     //   dHi = shf.l.clamp aLo, aHi, Amt
2493     //   dLo = aLo << Amt
2494 
2495     SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2496                              ShAmt);
2497     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2498 
2499     SDValue Ops[2] = { Lo, Hi };
2500     return DAG.getMergeValues(Ops, dl);
2501   }
2502   else {
2503     // {dHi, dLo} = {aHi, aLo} << Amt
2504     // - if (Amt>=size) then
2505     //      dLo = aLo << Amt (all 0)
2506     //      dLo = aLo << (Amt-size)
2507     //   else
2508     //      dLo = aLo << Amt
2509     //      dHi = (aHi << Amt) | (aLo >> (size-Amt))
2510 
2511     SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2512                                    DAG.getConstant(VTBits, dl, MVT::i32),
2513                                    ShAmt);
2514     SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2515     SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2516                                      DAG.getConstant(VTBits, dl, MVT::i32));
2517     SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2518     SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2519     SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2520 
2521     SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2522                                DAG.getConstant(VTBits, dl, MVT::i32),
2523                                ISD::SETGE);
2524     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2525     SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2526 
2527     SDValue Ops[2] = { Lo, Hi };
2528     return DAG.getMergeValues(Ops, dl);
2529   }
2530 }
2531 
2532 SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2533   EVT VT = Op.getValueType();
2534 
2535   if (VT == MVT::f32)
2536     return LowerFROUND32(Op, DAG);
2537 
2538   if (VT == MVT::f64)
2539     return LowerFROUND64(Op, DAG);
2540 
2541   llvm_unreachable("unhandled type");
2542 }
2543 
2544 // This is the the rounding method used in CUDA libdevice in C like code:
2545 // float roundf(float A)
2546 // {
2547 //   float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2548 //   RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2549 //   return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2550 // }
2551 SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2552                                            SelectionDAG &DAG) const {
2553   SDLoc SL(Op);
2554   SDValue A = Op.getOperand(0);
2555   EVT VT = Op.getValueType();
2556 
2557   SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2558 
2559   // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2560   SDValue Bitcast  = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2561   const int SignBitMask = 0x80000000;
2562   SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2563                              DAG.getConstant(SignBitMask, SL, MVT::i32));
2564   const int PointFiveInBits = 0x3F000000;
2565   SDValue PointFiveWithSignRaw =
2566       DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2567                   DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2568   SDValue PointFiveWithSign =
2569       DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2570   SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2571   SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2572 
2573   // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2574   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2575   SDValue IsLarge =
2576       DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2577                    ISD::SETOGT);
2578   RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2579 
2580   // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2581   SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2582                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2583   SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2584   return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2585 }
2586 
2587 // The implementation of round(double) is similar to that of round(float) in
2588 // that they both separate the value range into three regions and use a method
2589 // specific to the region to round the values. However, round(double) first
2590 // calculates the round of the absolute value and then adds the sign back while
2591 // round(float) directly rounds the value with sign.
2592 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2593                                            SelectionDAG &DAG) const {
2594   SDLoc SL(Op);
2595   SDValue A = Op.getOperand(0);
2596   EVT VT = Op.getValueType();
2597 
2598   SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2599 
2600   // double RoundedA = (double) (int) (abs(A) + 0.5f);
2601   SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2602                                   DAG.getConstantFP(0.5, SL, VT));
2603   SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2604 
2605   // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2606   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2607   SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2608                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2609   RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2610                          DAG.getConstantFP(0, SL, VT),
2611                          RoundedA);
2612 
2613   // Add sign to rounded_A
2614   RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2615   DAG.getNode(ISD::FTRUNC, SL, VT, A);
2616 
2617   // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2618   SDValue IsLarge =
2619       DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2620                    ISD::SETOGT);
2621   return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2622 }
2623 
2624 SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2625                                             SelectionDAG &DAG) const {
2626   assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2627 
2628   if (Op.getValueType() == MVT::bf16) {
2629     SDLoc Loc(Op);
2630     return DAG.getNode(
2631         ISD::FP_ROUND, Loc, MVT::bf16,
2632         DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2633         DAG.getIntPtrConstant(0, Loc));
2634   }
2635 
2636   // Everything else is considered legal.
2637   return Op;
2638 }
2639 
2640 SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2641                                             SelectionDAG &DAG) const {
2642   assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2643 
2644   if (Op.getOperand(0).getValueType() == MVT::bf16) {
2645     SDLoc Loc(Op);
2646     return DAG.getNode(
2647         Op.getOpcode(), Loc, Op.getValueType(),
2648         DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2649   }
2650 
2651   // Everything else is considered legal.
2652   return Op;
2653 }
2654 
2655 static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
2656   SDLoc DL(Op);
2657   if (Op.getValueType() != MVT::v2i16)
2658     return Op;
2659   EVT EltVT = Op.getValueType().getVectorElementType();
2660   SmallVector<SDValue> VecElements;
2661   for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2662     SmallVector<SDValue> ScalarArgs;
2663     llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2664                     [&](const SDUse &O) {
2665                       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2666                                          O.get(), DAG.getIntPtrConstant(I, DL));
2667                     });
2668     VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2669   }
2670   SDValue V =
2671       DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2672   return V;
2673 }
2674 
2675 SDValue
2676 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2677   switch (Op.getOpcode()) {
2678   case ISD::RETURNADDR:
2679     return SDValue();
2680   case ISD::FRAMEADDR:
2681     return SDValue();
2682   case ISD::GlobalAddress:
2683     return LowerGlobalAddress(Op, DAG);
2684   case ISD::INTRINSIC_W_CHAIN:
2685     return Op;
2686   case ISD::BUILD_VECTOR:
2687     return LowerBUILD_VECTOR(Op, DAG);
2688   case ISD::EXTRACT_SUBVECTOR:
2689     return Op;
2690   case ISD::EXTRACT_VECTOR_ELT:
2691     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2692   case ISD::INSERT_VECTOR_ELT:
2693     return LowerINSERT_VECTOR_ELT(Op, DAG);
2694   case ISD::VECTOR_SHUFFLE:
2695     return LowerVECTOR_SHUFFLE(Op, DAG);
2696   case ISD::CONCAT_VECTORS:
2697     return LowerCONCAT_VECTORS(Op, DAG);
2698   case ISD::STORE:
2699     return LowerSTORE(Op, DAG);
2700   case ISD::LOAD:
2701     return LowerLOAD(Op, DAG);
2702   case ISD::SHL_PARTS:
2703     return LowerShiftLeftParts(Op, DAG);
2704   case ISD::SRA_PARTS:
2705   case ISD::SRL_PARTS:
2706     return LowerShiftRightParts(Op, DAG);
2707   case ISD::SELECT:
2708     return LowerSelect(Op, DAG);
2709   case ISD::FROUND:
2710     return LowerFROUND(Op, DAG);
2711   case ISD::SINT_TO_FP:
2712   case ISD::UINT_TO_FP:
2713     return LowerINT_TO_FP(Op, DAG);
2714   case ISD::FP_TO_SINT:
2715   case ISD::FP_TO_UINT:
2716     return LowerFP_TO_INT(Op, DAG);
2717   case ISD::VAARG:
2718     return LowerVAARG(Op, DAG);
2719   case ISD::VASTART:
2720     return LowerVASTART(Op, DAG);
2721   case ISD::ABS:
2722   case ISD::SMIN:
2723   case ISD::SMAX:
2724   case ISD::UMIN:
2725   case ISD::UMAX:
2726   case ISD::ADD:
2727   case ISD::SUB:
2728   case ISD::MUL:
2729   case ISD::SHL:
2730   case ISD::SREM:
2731   case ISD::UREM:
2732     return LowerVectorArith(Op, DAG);
2733   case ISD::DYNAMIC_STACKALLOC:
2734     return LowerDYNAMIC_STACKALLOC(Op, DAG);
2735   default:
2736     llvm_unreachable("Custom lowering not defined for operation");
2737   }
2738 }
2739 
2740 // This function is almost a copy of SelectionDAG::expandVAArg().
2741 // The only diff is that this one produces loads from local address space.
2742 SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
2743   const TargetLowering *TLI = STI.getTargetLowering();
2744   SDLoc DL(Op);
2745 
2746   SDNode *Node = Op.getNode();
2747   const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
2748   EVT VT = Node->getValueType(0);
2749   auto *Ty = VT.getTypeForEVT(*DAG.getContext());
2750   SDValue Tmp1 = Node->getOperand(0);
2751   SDValue Tmp2 = Node->getOperand(1);
2752   const MaybeAlign MA(Node->getConstantOperandVal(3));
2753 
2754   SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
2755                                    Tmp1, Tmp2, MachinePointerInfo(V));
2756   SDValue VAList = VAListLoad;
2757 
2758   if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
2759     VAList = DAG.getNode(
2760         ISD::ADD, DL, VAList.getValueType(), VAList,
2761         DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
2762 
2763     VAList = DAG.getNode(
2764         ISD::AND, DL, VAList.getValueType(), VAList,
2765         DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType()));
2766   }
2767 
2768   // Increment the pointer, VAList, to the next vaarg
2769   Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
2770                      DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty),
2771                                      DL, VAList.getValueType()));
2772 
2773   // Store the incremented VAList to the legalized pointer
2774   Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
2775                       MachinePointerInfo(V));
2776 
2777   const Value *SrcV =
2778       Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL));
2779 
2780   // Load the actual argument out of the pointer VAList
2781   return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
2782 }
2783 
2784 SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
2785   const TargetLowering *TLI = STI.getTargetLowering();
2786   SDLoc DL(Op);
2787   EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
2788 
2789   // Store the address of unsized array <function>_vararg[] in the ap object.
2790   SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
2791   SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg);
2792 
2793   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2794   return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
2795                       MachinePointerInfo(SV));
2796 }
2797 
2798 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2799   SDValue Op0 = Op->getOperand(0);
2800   SDValue Op1 = Op->getOperand(1);
2801   SDValue Op2 = Op->getOperand(2);
2802   SDLoc DL(Op.getNode());
2803 
2804   assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2805 
2806   Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2807   Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2808   SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2809   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2810 
2811   return Trunc;
2812 }
2813 
2814 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2815   if (Op.getValueType() == MVT::i1)
2816     return LowerLOADi1(Op, DAG);
2817 
2818   // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle
2819   // unaligned loads and have to handle it here.
2820   EVT VT = Op.getValueType();
2821   if (Isv2x16VT(VT) || VT == MVT::v4i8) {
2822     LoadSDNode *Load = cast<LoadSDNode>(Op);
2823     EVT MemVT = Load->getMemoryVT();
2824     if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2825                                         MemVT, *Load->getMemOperand())) {
2826       SDValue Ops[2];
2827       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2828       return DAG.getMergeValues(Ops, SDLoc(Op));
2829     }
2830   }
2831 
2832   return SDValue();
2833 }
2834 
2835 // v = ld i1* addr
2836 //   =>
2837 // v1 = ld i8* addr (-> i16)
2838 // v = trunc i16 to i1
2839 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2840   SDNode *Node = Op.getNode();
2841   LoadSDNode *LD = cast<LoadSDNode>(Node);
2842   SDLoc dl(Node);
2843   assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2844   assert(Node->getValueType(0) == MVT::i1 &&
2845          "Custom lowering for i1 load only");
2846   SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2847                               LD->getPointerInfo(), LD->getAlign(),
2848                               LD->getMemOperand()->getFlags());
2849   SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2850   // The legalizer (the caller) is expecting two values from the legalized
2851   // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2852   // in LegalizeDAG.cpp which also uses MergeValues.
2853   SDValue Ops[] = { result, LD->getChain() };
2854   return DAG.getMergeValues(Ops, dl);
2855 }
2856 
2857 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2858   StoreSDNode *Store = cast<StoreSDNode>(Op);
2859   EVT VT = Store->getMemoryVT();
2860 
2861   if (VT == MVT::i1)
2862     return LowerSTOREi1(Op, DAG);
2863 
2864   // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2865   // stores and have to handle it here.
2866   if ((Isv2x16VT(VT) || VT == MVT::v4i8) &&
2867       !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2868                                       VT, *Store->getMemOperand()))
2869     return expandUnalignedStore(Store, DAG);
2870 
2871   // v2f16, v2bf16 and v2i16 don't need special handling.
2872   if (Isv2x16VT(VT) || VT == MVT::v4i8)
2873     return SDValue();
2874 
2875   if (VT.isVector())
2876     return LowerSTOREVector(Op, DAG);
2877 
2878   return SDValue();
2879 }
2880 
2881 SDValue
2882 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2883   SDNode *N = Op.getNode();
2884   SDValue Val = N->getOperand(1);
2885   SDLoc DL(N);
2886   EVT ValVT = Val.getValueType();
2887 
2888   if (ValVT.isVector()) {
2889     // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2890     // legal.  We can (and should) split that into 2 stores of <2 x double> here
2891     // but I'm leaving that as a TODO for now.
2892     if (!ValVT.isSimple())
2893       return SDValue();
2894     switch (ValVT.getSimpleVT().SimpleTy) {
2895     default:
2896       return SDValue();
2897     case MVT::v2i8:
2898     case MVT::v2i16:
2899     case MVT::v2i32:
2900     case MVT::v2i64:
2901     case MVT::v2f16:
2902     case MVT::v2bf16:
2903     case MVT::v2f32:
2904     case MVT::v2f64:
2905     case MVT::v4i8:
2906     case MVT::v4i16:
2907     case MVT::v4i32:
2908     case MVT::v4f16:
2909     case MVT::v4bf16:
2910     case MVT::v4f32:
2911     case MVT::v8f16: // <4 x f16x2>
2912     case MVT::v8bf16: // <4 x bf16x2>
2913     case MVT::v8i16:  // <4 x i16x2>
2914       // This is a "native" vector type
2915       break;
2916     }
2917 
2918     MemSDNode *MemSD = cast<MemSDNode>(N);
2919     const DataLayout &TD = DAG.getDataLayout();
2920 
2921     Align Alignment = MemSD->getAlign();
2922     Align PrefAlign =
2923         TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
2924     if (Alignment < PrefAlign) {
2925       // This store is not sufficiently aligned, so bail out and let this vector
2926       // store be scalarized.  Note that we may still be able to emit smaller
2927       // vector stores.  For example, if we are storing a <4 x float> with an
2928       // alignment of 8, this check will fail but the legalizer will try again
2929       // with 2 x <2 x float>, which will succeed with an alignment of 8.
2930       return SDValue();
2931     }
2932 
2933     unsigned Opcode = 0;
2934     EVT EltVT = ValVT.getVectorElementType();
2935     unsigned NumElts = ValVT.getVectorNumElements();
2936 
2937     // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
2938     // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
2939     // stored type to i16 and propagate the "real" type as the memory type.
2940     bool NeedExt = false;
2941     if (EltVT.getSizeInBits() < 16)
2942       NeedExt = true;
2943 
2944     bool StoreF16x2 = false;
2945     switch (NumElts) {
2946     default:
2947       return SDValue();
2948     case 2:
2949       Opcode = NVPTXISD::StoreV2;
2950       break;
2951     case 4:
2952       Opcode = NVPTXISD::StoreV4;
2953       break;
2954     case 8:
2955       // v8f16 is a special case. PTX doesn't have st.v8.f16
2956       // instruction. Instead, we split the vector into v2f16 chunks and
2957       // store them with st.v4.b32.
2958       assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector.");
2959       Opcode = NVPTXISD::StoreV4;
2960       StoreF16x2 = true;
2961       break;
2962     }
2963 
2964     SmallVector<SDValue, 8> Ops;
2965 
2966     // First is the chain
2967     Ops.push_back(N->getOperand(0));
2968 
2969     if (StoreF16x2) {
2970       // Combine f16,f16 -> v2f16
2971       NumElts /= 2;
2972       for (unsigned i = 0; i < NumElts; ++i) {
2973         SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2974                                  DAG.getIntPtrConstant(i * 2, DL));
2975         SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2976                                  DAG.getIntPtrConstant(i * 2 + 1, DL));
2977         EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2);
2978         SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1);
2979         Ops.push_back(V2);
2980       }
2981     } else {
2982       // Then the split values
2983       for (unsigned i = 0; i < NumElts; ++i) {
2984         SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2985                                      DAG.getIntPtrConstant(i, DL));
2986         if (NeedExt)
2987           ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
2988         Ops.push_back(ExtVal);
2989       }
2990     }
2991 
2992     // Then any remaining arguments
2993     Ops.append(N->op_begin() + 2, N->op_end());
2994 
2995     SDValue NewSt =
2996         DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
2997                                 MemSD->getMemoryVT(), MemSD->getMemOperand());
2998 
2999     // return DCI.CombineTo(N, NewSt, true);
3000     return NewSt;
3001   }
3002 
3003   return SDValue();
3004 }
3005 
3006 // st i1 v, addr
3007 //    =>
3008 // v1 = zxt v to i16
3009 // st.u8 i16, addr
3010 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3011   SDNode *Node = Op.getNode();
3012   SDLoc dl(Node);
3013   StoreSDNode *ST = cast<StoreSDNode>(Node);
3014   SDValue Tmp1 = ST->getChain();
3015   SDValue Tmp2 = ST->getBasePtr();
3016   SDValue Tmp3 = ST->getValue();
3017   assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3018   Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3019   SDValue Result =
3020       DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3021                         ST->getAlign(), ST->getMemOperand()->getFlags());
3022   return Result;
3023 }
3024 
3025 // This creates target external symbol for a function parameter.
3026 // Name of the symbol is composed from its index and the function name.
3027 // Negative index corresponds to special parameter (unsized array) used for
3028 // passing variable arguments.
3029 SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,
3030                                             EVT v) const {
3031   StringRef SavedStr = nvTM->getStrPool().save(
3032       getParamName(&DAG.getMachineFunction().getFunction(), idx));
3033   return DAG.getTargetExternalSymbol(SavedStr.data(), v);
3034 }
3035 
3036 SDValue NVPTXTargetLowering::LowerFormalArguments(
3037     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3038     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3039     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3040   MachineFunction &MF = DAG.getMachineFunction();
3041   const DataLayout &DL = DAG.getDataLayout();
3042   auto PtrVT = getPointerTy(DAG.getDataLayout());
3043 
3044   const Function *F = &MF.getFunction();
3045   const AttributeList &PAL = F->getAttributes();
3046   const TargetLowering *TLI = STI.getTargetLowering();
3047 
3048   SDValue Root = DAG.getRoot();
3049   std::vector<SDValue> OutChains;
3050 
3051   bool isABI = (STI.getSmVersion() >= 20);
3052   assert(isABI && "Non-ABI compilation is not supported");
3053   if (!isABI)
3054     return Chain;
3055 
3056   std::vector<Type *> argTypes;
3057   std::vector<const Argument *> theArgs;
3058   for (const Argument &I : F->args()) {
3059     theArgs.push_back(&I);
3060     argTypes.push_back(I.getType());
3061   }
3062   // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
3063   // Ins.size() will be larger
3064   //   * if there is an aggregate argument with multiple fields (each field
3065   //     showing up separately in Ins)
3066   //   * if there is a vector argument with more than typical vector-length
3067   //     elements (generally if more than 4) where each vector element is
3068   //     individually present in Ins.
3069   // So a different index should be used for indexing into Ins.
3070   // See similar issue in LowerCall.
3071   unsigned InsIdx = 0;
3072 
3073   int idx = 0;
3074   for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
3075     Type *Ty = argTypes[i];
3076 
3077     if (theArgs[i]->use_empty()) {
3078       // argument is dead
3079       if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) {
3080         SmallVector<EVT, 16> vtparts;
3081 
3082         ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
3083         if (vtparts.empty())
3084           report_fatal_error("Empty parameter types are not supported");
3085 
3086         for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
3087              ++parti) {
3088           InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3089           ++InsIdx;
3090         }
3091         if (vtparts.size() > 0)
3092           --InsIdx;
3093         continue;
3094       }
3095       if (Ty->isVectorTy()) {
3096         EVT ObjectVT = getValueType(DL, Ty);
3097         unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
3098         for (unsigned parti = 0; parti < NumRegs; ++parti) {
3099           InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3100           ++InsIdx;
3101         }
3102         if (NumRegs > 0)
3103           --InsIdx;
3104         continue;
3105       }
3106       InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3107       continue;
3108     }
3109 
3110     // In the following cases, assign a node order of "idx+1"
3111     // to newly created nodes. The SDNodes for params have to
3112     // appear in the same order as their order of appearance
3113     // in the original function. "idx+1" holds that order.
3114     if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
3115       bool aggregateIsPacked = false;
3116       if (StructType *STy = dyn_cast<StructType>(Ty))
3117         aggregateIsPacked = STy->isPacked();
3118 
3119       SmallVector<EVT, 16> VTs;
3120       SmallVector<uint64_t, 16> Offsets;
3121       ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
3122       if (VTs.empty())
3123         report_fatal_error("Empty parameter types are not supported");
3124 
3125       auto VectorInfo =
3126           VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty));
3127 
3128       SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
3129       int VecIdx = -1; // Index of the first element of the current vector.
3130       for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
3131         if (VectorInfo[parti] & PVF_FIRST) {
3132           assert(VecIdx == -1 && "Orphaned vector.");
3133           VecIdx = parti;
3134         }
3135 
3136         // That's the last element of this store op.
3137         if (VectorInfo[parti] & PVF_LAST) {
3138           unsigned NumElts = parti - VecIdx + 1;
3139           EVT EltVT = VTs[parti];
3140           // i1 is loaded/stored as i8.
3141           EVT LoadVT = EltVT;
3142           if (EltVT == MVT::i1)
3143             LoadVT = MVT::i8;
3144           else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
3145             // getLoad needs a vector type, but it can't handle
3146             // vectors which contain v2f16 or v2bf16 elements. So we must load
3147             // using i32 here and then bitcast back.
3148             LoadVT = MVT::i32;
3149 
3150           EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
3151           SDValue VecAddr =
3152               DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
3153                           DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
3154           Value *srcValue = Constant::getNullValue(PointerType::get(
3155               EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
3156           SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
3157                                   MachinePointerInfo(srcValue),
3158                                   MaybeAlign(aggregateIsPacked ? 1 : 0),
3159                                   MachineMemOperand::MODereferenceable |
3160                                       MachineMemOperand::MOInvariant);
3161           if (P.getNode())
3162             P.getNode()->setIROrder(idx + 1);
3163           for (unsigned j = 0; j < NumElts; ++j) {
3164             SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
3165                                       DAG.getIntPtrConstant(j, dl));
3166             // We've loaded i1 as an i8 and now must truncate it back to i1
3167             if (EltVT == MVT::i1)
3168               Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
3169             // v2f16 was loaded as an i32. Now we must bitcast it back.
3170             else if (EltVT != LoadVT)
3171               Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt);
3172 
3173             // If a promoted integer type is used, truncate down to the original
3174             MVT PromotedVT;
3175             if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
3176               Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
3177             }
3178 
3179             // Extend the element if necessary (e.g. an i8 is loaded
3180             // into an i16 register)
3181             if (Ins[InsIdx].VT.isInteger() &&
3182                 Ins[InsIdx].VT.getFixedSizeInBits() >
3183                     LoadVT.getFixedSizeInBits()) {
3184               unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
3185                                                            : ISD::ZERO_EXTEND;
3186               Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
3187             }
3188             InVals.push_back(Elt);
3189           }
3190 
3191           // Reset vector tracking state.
3192           VecIdx = -1;
3193         }
3194         ++InsIdx;
3195       }
3196       if (VTs.size() > 0)
3197         --InsIdx;
3198       continue;
3199     }
3200 
3201     // Param has ByVal attribute
3202     // Return MoveParam(param symbol).
3203     // Ideally, the param symbol can be returned directly,
3204     // but when SDNode builder decides to use it in a CopyToReg(),
3205     // machine instruction fails because TargetExternalSymbol
3206     // (not lowered) is target dependent, and CopyToReg assumes
3207     // the source is lowered.
3208     EVT ObjectVT = getValueType(DL, Ty);
3209     assert(ObjectVT == Ins[InsIdx].VT &&
3210            "Ins type did not match function type");
3211     SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
3212     SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
3213     if (p.getNode())
3214       p.getNode()->setIROrder(idx + 1);
3215     InVals.push_back(p);
3216   }
3217 
3218   if (!OutChains.empty())
3219     DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
3220 
3221   return Chain;
3222 }
3223 
3224 SDValue
3225 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3226                                  bool isVarArg,
3227                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
3228                                  const SmallVectorImpl<SDValue> &OutVals,
3229                                  const SDLoc &dl, SelectionDAG &DAG) const {
3230   const MachineFunction &MF = DAG.getMachineFunction();
3231   const Function &F = MF.getFunction();
3232   Type *RetTy = MF.getFunction().getReturnType();
3233 
3234   bool isABI = (STI.getSmVersion() >= 20);
3235   assert(isABI && "Non-ABI compilation is not supported");
3236   if (!isABI)
3237     return Chain;
3238 
3239   const DataLayout &DL = DAG.getDataLayout();
3240   SmallVector<SDValue, 16> PromotedOutVals;
3241   SmallVector<EVT, 16> VTs;
3242   SmallVector<uint64_t, 16> Offsets;
3243   ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
3244   assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3245 
3246   for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3247     SDValue PromotedOutVal = OutVals[i];
3248     MVT PromotedVT;
3249     if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
3250       VTs[i] = EVT(PromotedVT);
3251     }
3252     if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) {
3253       llvm::ISD::NodeType Ext =
3254           Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3255       PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
3256     }
3257     PromotedOutVals.push_back(PromotedOutVal);
3258   }
3259 
3260   auto VectorInfo = VectorizePTXValueVTs(
3261       VTs, Offsets,
3262       RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
3263                        : Align(1));
3264 
3265   // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3266   // 32-bits are sign extended or zero extended, depending on whether
3267   // they are signed or unsigned types.
3268   bool ExtendIntegerRetVal =
3269       RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
3270 
3271   SmallVector<SDValue, 6> StoreOperands;
3272   for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3273     // New load/store. Record chain and offset operands.
3274     if (VectorInfo[i] & PVF_FIRST) {
3275       assert(StoreOperands.empty() && "Orphaned operand list.");
3276       StoreOperands.push_back(Chain);
3277       StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
3278     }
3279 
3280     SDValue OutVal = OutVals[i];
3281     SDValue RetVal = PromotedOutVals[i];
3282 
3283     if (ExtendIntegerRetVal) {
3284       RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
3285                                                   : ISD::ZERO_EXTEND,
3286                            dl, MVT::i32, RetVal);
3287     } else if (OutVal.getValueSizeInBits() < 16) {
3288       // Use 16-bit registers for small load-stores as it's the
3289       // smallest general purpose register size supported by NVPTX.
3290       RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
3291     }
3292 
3293     // Record the value to return.
3294     StoreOperands.push_back(RetVal);
3295 
3296     // That's the last element of this store op.
3297     if (VectorInfo[i] & PVF_LAST) {
3298       NVPTXISD::NodeType Op;
3299       unsigned NumElts = StoreOperands.size() - 2;
3300       switch (NumElts) {
3301       case 1:
3302         Op = NVPTXISD::StoreRetval;
3303         break;
3304       case 2:
3305         Op = NVPTXISD::StoreRetvalV2;
3306         break;
3307       case 4:
3308         Op = NVPTXISD::StoreRetvalV4;
3309         break;
3310       default:
3311         llvm_unreachable("Invalid vector info.");
3312       }
3313 
3314       // Adjust type of load/store op if we've extended the scalar
3315       // return value.
3316       EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3317       Chain = DAG.getMemIntrinsicNode(
3318           Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
3319           MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
3320       // Cleanup vector state.
3321       StoreOperands.clear();
3322     }
3323   }
3324 
3325   return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3326 }
3327 
3328 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
3329     SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
3330     SelectionDAG &DAG) const {
3331   if (Constraint.size() > 1)
3332     return;
3333   TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3334 }
3335 
3336 static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
3337   switch (Intrinsic) {
3338   default:
3339     return 0;
3340 
3341   case Intrinsic::nvvm_tex_1d_v4f32_s32:
3342     return NVPTXISD::Tex1DFloatS32;
3343   case Intrinsic::nvvm_tex_1d_v4f32_f32:
3344     return NVPTXISD::Tex1DFloatFloat;
3345   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3346     return NVPTXISD::Tex1DFloatFloatLevel;
3347   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3348     return NVPTXISD::Tex1DFloatFloatGrad;
3349   case Intrinsic::nvvm_tex_1d_v4s32_s32:
3350     return NVPTXISD::Tex1DS32S32;
3351   case Intrinsic::nvvm_tex_1d_v4s32_f32:
3352     return NVPTXISD::Tex1DS32Float;
3353   case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3354     return NVPTXISD::Tex1DS32FloatLevel;
3355   case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3356     return NVPTXISD::Tex1DS32FloatGrad;
3357   case Intrinsic::nvvm_tex_1d_v4u32_s32:
3358     return NVPTXISD::Tex1DU32S32;
3359   case Intrinsic::nvvm_tex_1d_v4u32_f32:
3360     return NVPTXISD::Tex1DU32Float;
3361   case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3362     return NVPTXISD::Tex1DU32FloatLevel;
3363   case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3364     return NVPTXISD::Tex1DU32FloatGrad;
3365 
3366   case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3367     return NVPTXISD::Tex1DArrayFloatS32;
3368   case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3369     return NVPTXISD::Tex1DArrayFloatFloat;
3370   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3371     return NVPTXISD::Tex1DArrayFloatFloatLevel;
3372   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3373     return NVPTXISD::Tex1DArrayFloatFloatGrad;
3374   case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3375     return NVPTXISD::Tex1DArrayS32S32;
3376   case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3377     return NVPTXISD::Tex1DArrayS32Float;
3378   case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3379     return NVPTXISD::Tex1DArrayS32FloatLevel;
3380   case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3381     return NVPTXISD::Tex1DArrayS32FloatGrad;
3382   case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3383     return NVPTXISD::Tex1DArrayU32S32;
3384   case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3385     return NVPTXISD::Tex1DArrayU32Float;
3386   case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3387     return NVPTXISD::Tex1DArrayU32FloatLevel;
3388   case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3389     return NVPTXISD::Tex1DArrayU32FloatGrad;
3390 
3391   case Intrinsic::nvvm_tex_2d_v4f32_s32:
3392     return NVPTXISD::Tex2DFloatS32;
3393   case Intrinsic::nvvm_tex_2d_v4f32_f32:
3394     return NVPTXISD::Tex2DFloatFloat;
3395   case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3396     return NVPTXISD::Tex2DFloatFloatLevel;
3397   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3398     return NVPTXISD::Tex2DFloatFloatGrad;
3399   case Intrinsic::nvvm_tex_2d_v4s32_s32:
3400     return NVPTXISD::Tex2DS32S32;
3401   case Intrinsic::nvvm_tex_2d_v4s32_f32:
3402     return NVPTXISD::Tex2DS32Float;
3403   case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3404     return NVPTXISD::Tex2DS32FloatLevel;
3405   case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3406     return NVPTXISD::Tex2DS32FloatGrad;
3407   case Intrinsic::nvvm_tex_2d_v4u32_s32:
3408     return NVPTXISD::Tex2DU32S32;
3409   case Intrinsic::nvvm_tex_2d_v4u32_f32:
3410     return NVPTXISD::Tex2DU32Float;
3411   case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3412     return NVPTXISD::Tex2DU32FloatLevel;
3413   case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3414     return NVPTXISD::Tex2DU32FloatGrad;
3415 
3416   case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3417     return NVPTXISD::Tex2DArrayFloatS32;
3418   case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3419     return NVPTXISD::Tex2DArrayFloatFloat;
3420   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3421     return NVPTXISD::Tex2DArrayFloatFloatLevel;
3422   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3423     return NVPTXISD::Tex2DArrayFloatFloatGrad;
3424   case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3425     return NVPTXISD::Tex2DArrayS32S32;
3426   case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3427     return NVPTXISD::Tex2DArrayS32Float;
3428   case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3429     return NVPTXISD::Tex2DArrayS32FloatLevel;
3430   case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3431     return NVPTXISD::Tex2DArrayS32FloatGrad;
3432   case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3433     return NVPTXISD::Tex2DArrayU32S32;
3434   case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3435     return NVPTXISD::Tex2DArrayU32Float;
3436   case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3437     return NVPTXISD::Tex2DArrayU32FloatLevel;
3438   case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3439     return NVPTXISD::Tex2DArrayU32FloatGrad;
3440 
3441   case Intrinsic::nvvm_tex_3d_v4f32_s32:
3442     return NVPTXISD::Tex3DFloatS32;
3443   case Intrinsic::nvvm_tex_3d_v4f32_f32:
3444     return NVPTXISD::Tex3DFloatFloat;
3445   case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3446     return NVPTXISD::Tex3DFloatFloatLevel;
3447   case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3448     return NVPTXISD::Tex3DFloatFloatGrad;
3449   case Intrinsic::nvvm_tex_3d_v4s32_s32:
3450     return NVPTXISD::Tex3DS32S32;
3451   case Intrinsic::nvvm_tex_3d_v4s32_f32:
3452     return NVPTXISD::Tex3DS32Float;
3453   case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3454     return NVPTXISD::Tex3DS32FloatLevel;
3455   case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3456     return NVPTXISD::Tex3DS32FloatGrad;
3457   case Intrinsic::nvvm_tex_3d_v4u32_s32:
3458     return NVPTXISD::Tex3DU32S32;
3459   case Intrinsic::nvvm_tex_3d_v4u32_f32:
3460     return NVPTXISD::Tex3DU32Float;
3461   case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3462     return NVPTXISD::Tex3DU32FloatLevel;
3463   case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3464     return NVPTXISD::Tex3DU32FloatGrad;
3465 
3466   case Intrinsic::nvvm_tex_cube_v4f32_f32:
3467     return NVPTXISD::TexCubeFloatFloat;
3468   case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3469     return NVPTXISD::TexCubeFloatFloatLevel;
3470   case Intrinsic::nvvm_tex_cube_v4s32_f32:
3471     return NVPTXISD::TexCubeS32Float;
3472   case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3473     return NVPTXISD::TexCubeS32FloatLevel;
3474   case Intrinsic::nvvm_tex_cube_v4u32_f32:
3475     return NVPTXISD::TexCubeU32Float;
3476   case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3477     return NVPTXISD::TexCubeU32FloatLevel;
3478 
3479   case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3480     return NVPTXISD::TexCubeArrayFloatFloat;
3481   case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3482     return NVPTXISD::TexCubeArrayFloatFloatLevel;
3483   case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3484     return NVPTXISD::TexCubeArrayS32Float;
3485   case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3486     return NVPTXISD::TexCubeArrayS32FloatLevel;
3487   case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3488     return NVPTXISD::TexCubeArrayU32Float;
3489   case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3490     return NVPTXISD::TexCubeArrayU32FloatLevel;
3491 
3492   case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3493     return NVPTXISD::Tld4R2DFloatFloat;
3494   case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3495     return NVPTXISD::Tld4G2DFloatFloat;
3496   case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3497     return NVPTXISD::Tld4B2DFloatFloat;
3498   case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3499     return NVPTXISD::Tld4A2DFloatFloat;
3500   case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3501     return NVPTXISD::Tld4R2DS64Float;
3502   case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3503     return NVPTXISD::Tld4G2DS64Float;
3504   case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3505     return NVPTXISD::Tld4B2DS64Float;
3506   case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3507     return NVPTXISD::Tld4A2DS64Float;
3508   case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3509     return NVPTXISD::Tld4R2DU64Float;
3510   case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3511     return NVPTXISD::Tld4G2DU64Float;
3512   case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3513     return NVPTXISD::Tld4B2DU64Float;
3514   case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3515     return NVPTXISD::Tld4A2DU64Float;
3516 
3517   case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3518     return NVPTXISD::TexUnified1DFloatS32;
3519   case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3520     return NVPTXISD::TexUnified1DFloatFloat;
3521   case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3522     return NVPTXISD::TexUnified1DFloatFloatLevel;
3523   case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3524     return NVPTXISD::TexUnified1DFloatFloatGrad;
3525   case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3526     return NVPTXISD::TexUnified1DS32S32;
3527   case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3528     return NVPTXISD::TexUnified1DS32Float;
3529   case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3530     return NVPTXISD::TexUnified1DS32FloatLevel;
3531   case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3532     return NVPTXISD::TexUnified1DS32FloatGrad;
3533   case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3534     return NVPTXISD::TexUnified1DU32S32;
3535   case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3536     return NVPTXISD::TexUnified1DU32Float;
3537   case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3538     return NVPTXISD::TexUnified1DU32FloatLevel;
3539   case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3540     return NVPTXISD::TexUnified1DU32FloatGrad;
3541 
3542   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3543     return NVPTXISD::TexUnified1DArrayFloatS32;
3544   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3545     return NVPTXISD::TexUnified1DArrayFloatFloat;
3546   case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3547     return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
3548   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3549     return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
3550   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3551     return NVPTXISD::TexUnified1DArrayS32S32;
3552   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3553     return NVPTXISD::TexUnified1DArrayS32Float;
3554   case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3555     return NVPTXISD::TexUnified1DArrayS32FloatLevel;
3556   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3557     return NVPTXISD::TexUnified1DArrayS32FloatGrad;
3558   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3559     return NVPTXISD::TexUnified1DArrayU32S32;
3560   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3561     return NVPTXISD::TexUnified1DArrayU32Float;
3562   case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3563     return NVPTXISD::TexUnified1DArrayU32FloatLevel;
3564   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3565     return NVPTXISD::TexUnified1DArrayU32FloatGrad;
3566 
3567   case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3568     return NVPTXISD::TexUnified2DFloatS32;
3569   case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3570     return NVPTXISD::TexUnified2DFloatFloat;
3571   case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3572     return NVPTXISD::TexUnified2DFloatFloatLevel;
3573   case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3574     return NVPTXISD::TexUnified2DFloatFloatGrad;
3575   case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3576     return NVPTXISD::TexUnified2DS32S32;
3577   case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3578     return NVPTXISD::TexUnified2DS32Float;
3579   case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3580     return NVPTXISD::TexUnified2DS32FloatLevel;
3581   case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3582     return NVPTXISD::TexUnified2DS32FloatGrad;
3583   case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3584     return NVPTXISD::TexUnified2DU32S32;
3585   case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3586     return NVPTXISD::TexUnified2DU32Float;
3587   case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3588     return NVPTXISD::TexUnified2DU32FloatLevel;
3589   case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3590     return NVPTXISD::TexUnified2DU32FloatGrad;
3591 
3592   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3593     return NVPTXISD::TexUnified2DArrayFloatS32;
3594   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3595     return NVPTXISD::TexUnified2DArrayFloatFloat;
3596   case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3597     return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
3598   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3599     return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
3600   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3601     return NVPTXISD::TexUnified2DArrayS32S32;
3602   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3603     return NVPTXISD::TexUnified2DArrayS32Float;
3604   case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3605     return NVPTXISD::TexUnified2DArrayS32FloatLevel;
3606   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3607     return NVPTXISD::TexUnified2DArrayS32FloatGrad;
3608   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3609     return NVPTXISD::TexUnified2DArrayU32S32;
3610   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3611     return NVPTXISD::TexUnified2DArrayU32Float;
3612   case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3613     return NVPTXISD::TexUnified2DArrayU32FloatLevel;
3614   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3615     return NVPTXISD::TexUnified2DArrayU32FloatGrad;
3616 
3617   case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3618     return NVPTXISD::TexUnified3DFloatS32;
3619   case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3620     return NVPTXISD::TexUnified3DFloatFloat;
3621   case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3622     return NVPTXISD::TexUnified3DFloatFloatLevel;
3623   case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3624     return NVPTXISD::TexUnified3DFloatFloatGrad;
3625   case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3626     return NVPTXISD::TexUnified3DS32S32;
3627   case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3628     return NVPTXISD::TexUnified3DS32Float;
3629   case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3630     return NVPTXISD::TexUnified3DS32FloatLevel;
3631   case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3632     return NVPTXISD::TexUnified3DS32FloatGrad;
3633   case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3634     return NVPTXISD::TexUnified3DU32S32;
3635   case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3636     return NVPTXISD::TexUnified3DU32Float;
3637   case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3638     return NVPTXISD::TexUnified3DU32FloatLevel;
3639   case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3640     return NVPTXISD::TexUnified3DU32FloatGrad;
3641 
3642   case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3643     return NVPTXISD::TexUnifiedCubeFloatFloat;
3644   case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3645     return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
3646   case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3647     return NVPTXISD::TexUnifiedCubeS32Float;
3648   case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3649     return NVPTXISD::TexUnifiedCubeS32FloatLevel;
3650   case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3651     return NVPTXISD::TexUnifiedCubeU32Float;
3652   case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3653     return NVPTXISD::TexUnifiedCubeU32FloatLevel;
3654 
3655   case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3656     return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
3657   case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3658     return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
3659   case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3660     return NVPTXISD::TexUnifiedCubeArrayS32Float;
3661   case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3662     return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
3663   case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3664     return NVPTXISD::TexUnifiedCubeArrayU32Float;
3665   case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3666     return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
3667 
3668   case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
3669     return NVPTXISD::TexUnifiedCubeFloatFloatGrad;
3670   case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
3671     return NVPTXISD::TexUnifiedCubeS32FloatGrad;
3672   case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
3673     return NVPTXISD::TexUnifiedCubeU32FloatGrad;
3674   case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
3675     return NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad;
3676   case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
3677     return NVPTXISD::TexUnifiedCubeArrayS32FloatGrad;
3678   case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
3679     return NVPTXISD::TexUnifiedCubeArrayU32FloatGrad;
3680 
3681   case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3682     return NVPTXISD::Tld4UnifiedR2DFloatFloat;
3683   case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3684     return NVPTXISD::Tld4UnifiedG2DFloatFloat;
3685   case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3686     return NVPTXISD::Tld4UnifiedB2DFloatFloat;
3687   case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3688     return NVPTXISD::Tld4UnifiedA2DFloatFloat;
3689   case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3690     return NVPTXISD::Tld4UnifiedR2DS64Float;
3691   case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3692     return NVPTXISD::Tld4UnifiedG2DS64Float;
3693   case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3694     return NVPTXISD::Tld4UnifiedB2DS64Float;
3695   case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3696     return NVPTXISD::Tld4UnifiedA2DS64Float;
3697   case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3698     return NVPTXISD::Tld4UnifiedR2DU64Float;
3699   case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3700     return NVPTXISD::Tld4UnifiedG2DU64Float;
3701   case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3702     return NVPTXISD::Tld4UnifiedB2DU64Float;
3703   case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3704     return NVPTXISD::Tld4UnifiedA2DU64Float;
3705   }
3706 }
3707 
3708 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3709   switch (Intrinsic) {
3710   default:
3711     return 0;
3712   case Intrinsic::nvvm_suld_1d_i8_clamp:
3713     return NVPTXISD::Suld1DI8Clamp;
3714   case Intrinsic::nvvm_suld_1d_i16_clamp:
3715     return NVPTXISD::Suld1DI16Clamp;
3716   case Intrinsic::nvvm_suld_1d_i32_clamp:
3717     return NVPTXISD::Suld1DI32Clamp;
3718   case Intrinsic::nvvm_suld_1d_i64_clamp:
3719     return NVPTXISD::Suld1DI64Clamp;
3720   case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3721     return NVPTXISD::Suld1DV2I8Clamp;
3722   case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3723     return NVPTXISD::Suld1DV2I16Clamp;
3724   case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3725     return NVPTXISD::Suld1DV2I32Clamp;
3726   case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3727     return NVPTXISD::Suld1DV2I64Clamp;
3728   case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3729     return NVPTXISD::Suld1DV4I8Clamp;
3730   case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3731     return NVPTXISD::Suld1DV4I16Clamp;
3732   case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3733     return NVPTXISD::Suld1DV4I32Clamp;
3734   case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3735     return NVPTXISD::Suld1DArrayI8Clamp;
3736   case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3737     return NVPTXISD::Suld1DArrayI16Clamp;
3738   case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3739     return NVPTXISD::Suld1DArrayI32Clamp;
3740   case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3741     return NVPTXISD::Suld1DArrayI64Clamp;
3742   case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3743     return NVPTXISD::Suld1DArrayV2I8Clamp;
3744   case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3745     return NVPTXISD::Suld1DArrayV2I16Clamp;
3746   case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3747     return NVPTXISD::Suld1DArrayV2I32Clamp;
3748   case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3749     return NVPTXISD::Suld1DArrayV2I64Clamp;
3750   case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3751     return NVPTXISD::Suld1DArrayV4I8Clamp;
3752   case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3753     return NVPTXISD::Suld1DArrayV4I16Clamp;
3754   case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3755     return NVPTXISD::Suld1DArrayV4I32Clamp;
3756   case Intrinsic::nvvm_suld_2d_i8_clamp:
3757     return NVPTXISD::Suld2DI8Clamp;
3758   case Intrinsic::nvvm_suld_2d_i16_clamp:
3759     return NVPTXISD::Suld2DI16Clamp;
3760   case Intrinsic::nvvm_suld_2d_i32_clamp:
3761     return NVPTXISD::Suld2DI32Clamp;
3762   case Intrinsic::nvvm_suld_2d_i64_clamp:
3763     return NVPTXISD::Suld2DI64Clamp;
3764   case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3765     return NVPTXISD::Suld2DV2I8Clamp;
3766   case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3767     return NVPTXISD::Suld2DV2I16Clamp;
3768   case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3769     return NVPTXISD::Suld2DV2I32Clamp;
3770   case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3771     return NVPTXISD::Suld2DV2I64Clamp;
3772   case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3773     return NVPTXISD::Suld2DV4I8Clamp;
3774   case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3775     return NVPTXISD::Suld2DV4I16Clamp;
3776   case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3777     return NVPTXISD::Suld2DV4I32Clamp;
3778   case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3779     return NVPTXISD::Suld2DArrayI8Clamp;
3780   case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3781     return NVPTXISD::Suld2DArrayI16Clamp;
3782   case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3783     return NVPTXISD::Suld2DArrayI32Clamp;
3784   case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3785     return NVPTXISD::Suld2DArrayI64Clamp;
3786   case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3787     return NVPTXISD::Suld2DArrayV2I8Clamp;
3788   case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3789     return NVPTXISD::Suld2DArrayV2I16Clamp;
3790   case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3791     return NVPTXISD::Suld2DArrayV2I32Clamp;
3792   case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3793     return NVPTXISD::Suld2DArrayV2I64Clamp;
3794   case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3795     return NVPTXISD::Suld2DArrayV4I8Clamp;
3796   case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3797     return NVPTXISD::Suld2DArrayV4I16Clamp;
3798   case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3799     return NVPTXISD::Suld2DArrayV4I32Clamp;
3800   case Intrinsic::nvvm_suld_3d_i8_clamp:
3801     return NVPTXISD::Suld3DI8Clamp;
3802   case Intrinsic::nvvm_suld_3d_i16_clamp:
3803     return NVPTXISD::Suld3DI16Clamp;
3804   case Intrinsic::nvvm_suld_3d_i32_clamp:
3805     return NVPTXISD::Suld3DI32Clamp;
3806   case Intrinsic::nvvm_suld_3d_i64_clamp:
3807     return NVPTXISD::Suld3DI64Clamp;
3808   case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3809     return NVPTXISD::Suld3DV2I8Clamp;
3810   case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3811     return NVPTXISD::Suld3DV2I16Clamp;
3812   case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3813     return NVPTXISD::Suld3DV2I32Clamp;
3814   case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3815     return NVPTXISD::Suld3DV2I64Clamp;
3816   case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3817     return NVPTXISD::Suld3DV4I8Clamp;
3818   case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3819     return NVPTXISD::Suld3DV4I16Clamp;
3820   case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3821     return NVPTXISD::Suld3DV4I32Clamp;
3822   case Intrinsic::nvvm_suld_1d_i8_trap:
3823     return NVPTXISD::Suld1DI8Trap;
3824   case Intrinsic::nvvm_suld_1d_i16_trap:
3825     return NVPTXISD::Suld1DI16Trap;
3826   case Intrinsic::nvvm_suld_1d_i32_trap:
3827     return NVPTXISD::Suld1DI32Trap;
3828   case Intrinsic::nvvm_suld_1d_i64_trap:
3829     return NVPTXISD::Suld1DI64Trap;
3830   case Intrinsic::nvvm_suld_1d_v2i8_trap:
3831     return NVPTXISD::Suld1DV2I8Trap;
3832   case Intrinsic::nvvm_suld_1d_v2i16_trap:
3833     return NVPTXISD::Suld1DV2I16Trap;
3834   case Intrinsic::nvvm_suld_1d_v2i32_trap:
3835     return NVPTXISD::Suld1DV2I32Trap;
3836   case Intrinsic::nvvm_suld_1d_v2i64_trap:
3837     return NVPTXISD::Suld1DV2I64Trap;
3838   case Intrinsic::nvvm_suld_1d_v4i8_trap:
3839     return NVPTXISD::Suld1DV4I8Trap;
3840   case Intrinsic::nvvm_suld_1d_v4i16_trap:
3841     return NVPTXISD::Suld1DV4I16Trap;
3842   case Intrinsic::nvvm_suld_1d_v4i32_trap:
3843     return NVPTXISD::Suld1DV4I32Trap;
3844   case Intrinsic::nvvm_suld_1d_array_i8_trap:
3845     return NVPTXISD::Suld1DArrayI8Trap;
3846   case Intrinsic::nvvm_suld_1d_array_i16_trap:
3847     return NVPTXISD::Suld1DArrayI16Trap;
3848   case Intrinsic::nvvm_suld_1d_array_i32_trap:
3849     return NVPTXISD::Suld1DArrayI32Trap;
3850   case Intrinsic::nvvm_suld_1d_array_i64_trap:
3851     return NVPTXISD::Suld1DArrayI64Trap;
3852   case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3853     return NVPTXISD::Suld1DArrayV2I8Trap;
3854   case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3855     return NVPTXISD::Suld1DArrayV2I16Trap;
3856   case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3857     return NVPTXISD::Suld1DArrayV2I32Trap;
3858   case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3859     return NVPTXISD::Suld1DArrayV2I64Trap;
3860   case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3861     return NVPTXISD::Suld1DArrayV4I8Trap;
3862   case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3863     return NVPTXISD::Suld1DArrayV4I16Trap;
3864   case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3865     return NVPTXISD::Suld1DArrayV4I32Trap;
3866   case Intrinsic::nvvm_suld_2d_i8_trap:
3867     return NVPTXISD::Suld2DI8Trap;
3868   case Intrinsic::nvvm_suld_2d_i16_trap:
3869     return NVPTXISD::Suld2DI16Trap;
3870   case Intrinsic::nvvm_suld_2d_i32_trap:
3871     return NVPTXISD::Suld2DI32Trap;
3872   case Intrinsic::nvvm_suld_2d_i64_trap:
3873     return NVPTXISD::Suld2DI64Trap;
3874   case Intrinsic::nvvm_suld_2d_v2i8_trap:
3875     return NVPTXISD::Suld2DV2I8Trap;
3876   case Intrinsic::nvvm_suld_2d_v2i16_trap:
3877     return NVPTXISD::Suld2DV2I16Trap;
3878   case Intrinsic::nvvm_suld_2d_v2i32_trap:
3879     return NVPTXISD::Suld2DV2I32Trap;
3880   case Intrinsic::nvvm_suld_2d_v2i64_trap:
3881     return NVPTXISD::Suld2DV2I64Trap;
3882   case Intrinsic::nvvm_suld_2d_v4i8_trap:
3883     return NVPTXISD::Suld2DV4I8Trap;
3884   case Intrinsic::nvvm_suld_2d_v4i16_trap:
3885     return NVPTXISD::Suld2DV4I16Trap;
3886   case Intrinsic::nvvm_suld_2d_v4i32_trap:
3887     return NVPTXISD::Suld2DV4I32Trap;
3888   case Intrinsic::nvvm_suld_2d_array_i8_trap:
3889     return NVPTXISD::Suld2DArrayI8Trap;
3890   case Intrinsic::nvvm_suld_2d_array_i16_trap:
3891     return NVPTXISD::Suld2DArrayI16Trap;
3892   case Intrinsic::nvvm_suld_2d_array_i32_trap:
3893     return NVPTXISD::Suld2DArrayI32Trap;
3894   case Intrinsic::nvvm_suld_2d_array_i64_trap:
3895     return NVPTXISD::Suld2DArrayI64Trap;
3896   case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3897     return NVPTXISD::Suld2DArrayV2I8Trap;
3898   case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3899     return NVPTXISD::Suld2DArrayV2I16Trap;
3900   case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3901     return NVPTXISD::Suld2DArrayV2I32Trap;
3902   case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3903     return NVPTXISD::Suld2DArrayV2I64Trap;
3904   case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3905     return NVPTXISD::Suld2DArrayV4I8Trap;
3906   case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3907     return NVPTXISD::Suld2DArrayV4I16Trap;
3908   case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3909     return NVPTXISD::Suld2DArrayV4I32Trap;
3910   case Intrinsic::nvvm_suld_3d_i8_trap:
3911     return NVPTXISD::Suld3DI8Trap;
3912   case Intrinsic::nvvm_suld_3d_i16_trap:
3913     return NVPTXISD::Suld3DI16Trap;
3914   case Intrinsic::nvvm_suld_3d_i32_trap:
3915     return NVPTXISD::Suld3DI32Trap;
3916   case Intrinsic::nvvm_suld_3d_i64_trap:
3917     return NVPTXISD::Suld3DI64Trap;
3918   case Intrinsic::nvvm_suld_3d_v2i8_trap:
3919     return NVPTXISD::Suld3DV2I8Trap;
3920   case Intrinsic::nvvm_suld_3d_v2i16_trap:
3921     return NVPTXISD::Suld3DV2I16Trap;
3922   case Intrinsic::nvvm_suld_3d_v2i32_trap:
3923     return NVPTXISD::Suld3DV2I32Trap;
3924   case Intrinsic::nvvm_suld_3d_v2i64_trap:
3925     return NVPTXISD::Suld3DV2I64Trap;
3926   case Intrinsic::nvvm_suld_3d_v4i8_trap:
3927     return NVPTXISD::Suld3DV4I8Trap;
3928   case Intrinsic::nvvm_suld_3d_v4i16_trap:
3929     return NVPTXISD::Suld3DV4I16Trap;
3930   case Intrinsic::nvvm_suld_3d_v4i32_trap:
3931     return NVPTXISD::Suld3DV4I32Trap;
3932   case Intrinsic::nvvm_suld_1d_i8_zero:
3933     return NVPTXISD::Suld1DI8Zero;
3934   case Intrinsic::nvvm_suld_1d_i16_zero:
3935     return NVPTXISD::Suld1DI16Zero;
3936   case Intrinsic::nvvm_suld_1d_i32_zero:
3937     return NVPTXISD::Suld1DI32Zero;
3938   case Intrinsic::nvvm_suld_1d_i64_zero:
3939     return NVPTXISD::Suld1DI64Zero;
3940   case Intrinsic::nvvm_suld_1d_v2i8_zero:
3941     return NVPTXISD::Suld1DV2I8Zero;
3942   case Intrinsic::nvvm_suld_1d_v2i16_zero:
3943     return NVPTXISD::Suld1DV2I16Zero;
3944   case Intrinsic::nvvm_suld_1d_v2i32_zero:
3945     return NVPTXISD::Suld1DV2I32Zero;
3946   case Intrinsic::nvvm_suld_1d_v2i64_zero:
3947     return NVPTXISD::Suld1DV2I64Zero;
3948   case Intrinsic::nvvm_suld_1d_v4i8_zero:
3949     return NVPTXISD::Suld1DV4I8Zero;
3950   case Intrinsic::nvvm_suld_1d_v4i16_zero:
3951     return NVPTXISD::Suld1DV4I16Zero;
3952   case Intrinsic::nvvm_suld_1d_v4i32_zero:
3953     return NVPTXISD::Suld1DV4I32Zero;
3954   case Intrinsic::nvvm_suld_1d_array_i8_zero:
3955     return NVPTXISD::Suld1DArrayI8Zero;
3956   case Intrinsic::nvvm_suld_1d_array_i16_zero:
3957     return NVPTXISD::Suld1DArrayI16Zero;
3958   case Intrinsic::nvvm_suld_1d_array_i32_zero:
3959     return NVPTXISD::Suld1DArrayI32Zero;
3960   case Intrinsic::nvvm_suld_1d_array_i64_zero:
3961     return NVPTXISD::Suld1DArrayI64Zero;
3962   case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3963     return NVPTXISD::Suld1DArrayV2I8Zero;
3964   case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3965     return NVPTXISD::Suld1DArrayV2I16Zero;
3966   case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3967     return NVPTXISD::Suld1DArrayV2I32Zero;
3968   case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3969     return NVPTXISD::Suld1DArrayV2I64Zero;
3970   case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3971     return NVPTXISD::Suld1DArrayV4I8Zero;
3972   case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3973     return NVPTXISD::Suld1DArrayV4I16Zero;
3974   case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3975     return NVPTXISD::Suld1DArrayV4I32Zero;
3976   case Intrinsic::nvvm_suld_2d_i8_zero:
3977     return NVPTXISD::Suld2DI8Zero;
3978   case Intrinsic::nvvm_suld_2d_i16_zero:
3979     return NVPTXISD::Suld2DI16Zero;
3980   case Intrinsic::nvvm_suld_2d_i32_zero:
3981     return NVPTXISD::Suld2DI32Zero;
3982   case Intrinsic::nvvm_suld_2d_i64_zero:
3983     return NVPTXISD::Suld2DI64Zero;
3984   case Intrinsic::nvvm_suld_2d_v2i8_zero:
3985     return NVPTXISD::Suld2DV2I8Zero;
3986   case Intrinsic::nvvm_suld_2d_v2i16_zero:
3987     return NVPTXISD::Suld2DV2I16Zero;
3988   case Intrinsic::nvvm_suld_2d_v2i32_zero:
3989     return NVPTXISD::Suld2DV2I32Zero;
3990   case Intrinsic::nvvm_suld_2d_v2i64_zero:
3991     return NVPTXISD::Suld2DV2I64Zero;
3992   case Intrinsic::nvvm_suld_2d_v4i8_zero:
3993     return NVPTXISD::Suld2DV4I8Zero;
3994   case Intrinsic::nvvm_suld_2d_v4i16_zero:
3995     return NVPTXISD::Suld2DV4I16Zero;
3996   case Intrinsic::nvvm_suld_2d_v4i32_zero:
3997     return NVPTXISD::Suld2DV4I32Zero;
3998   case Intrinsic::nvvm_suld_2d_array_i8_zero:
3999     return NVPTXISD::Suld2DArrayI8Zero;
4000   case Intrinsic::nvvm_suld_2d_array_i16_zero:
4001     return NVPTXISD::Suld2DArrayI16Zero;
4002   case Intrinsic::nvvm_suld_2d_array_i32_zero:
4003     return NVPTXISD::Suld2DArrayI32Zero;
4004   case Intrinsic::nvvm_suld_2d_array_i64_zero:
4005     return NVPTXISD::Suld2DArrayI64Zero;
4006   case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4007     return NVPTXISD::Suld2DArrayV2I8Zero;
4008   case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4009     return NVPTXISD::Suld2DArrayV2I16Zero;
4010   case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4011     return NVPTXISD::Suld2DArrayV2I32Zero;
4012   case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4013     return NVPTXISD::Suld2DArrayV2I64Zero;
4014   case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4015     return NVPTXISD::Suld2DArrayV4I8Zero;
4016   case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4017     return NVPTXISD::Suld2DArrayV4I16Zero;
4018   case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4019     return NVPTXISD::Suld2DArrayV4I32Zero;
4020   case Intrinsic::nvvm_suld_3d_i8_zero:
4021     return NVPTXISD::Suld3DI8Zero;
4022   case Intrinsic::nvvm_suld_3d_i16_zero:
4023     return NVPTXISD::Suld3DI16Zero;
4024   case Intrinsic::nvvm_suld_3d_i32_zero:
4025     return NVPTXISD::Suld3DI32Zero;
4026   case Intrinsic::nvvm_suld_3d_i64_zero:
4027     return NVPTXISD::Suld3DI64Zero;
4028   case Intrinsic::nvvm_suld_3d_v2i8_zero:
4029     return NVPTXISD::Suld3DV2I8Zero;
4030   case Intrinsic::nvvm_suld_3d_v2i16_zero:
4031     return NVPTXISD::Suld3DV2I16Zero;
4032   case Intrinsic::nvvm_suld_3d_v2i32_zero:
4033     return NVPTXISD::Suld3DV2I32Zero;
4034   case Intrinsic::nvvm_suld_3d_v2i64_zero:
4035     return NVPTXISD::Suld3DV2I64Zero;
4036   case Intrinsic::nvvm_suld_3d_v4i8_zero:
4037     return NVPTXISD::Suld3DV4I8Zero;
4038   case Intrinsic::nvvm_suld_3d_v4i16_zero:
4039     return NVPTXISD::Suld3DV4I16Zero;
4040   case Intrinsic::nvvm_suld_3d_v4i32_zero:
4041     return NVPTXISD::Suld3DV4I32Zero;
4042   }
4043 }
4044 
// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
// TgtMemIntrinsic because we need information that is only available in the
// "Value" type of the destination pointer -- in particular, its address space.
4050 bool NVPTXTargetLowering::getTgtMemIntrinsic(
4051     IntrinsicInfo &Info, const CallInst &I,
4052     MachineFunction &MF, unsigned Intrinsic) const {
4053   switch (Intrinsic) {
4054   default:
4055     return false;
4056   case Intrinsic::nvvm_match_all_sync_i32p:
4057   case Intrinsic::nvvm_match_all_sync_i64p:
4058     Info.opc = ISD::INTRINSIC_W_CHAIN;
4059     // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
4060     // in order to model data exchange with other threads, but perform no real
4061     // memory accesses.
4062     Info.memVT = MVT::i1;
4063 
4064     // Our result depends on both our and other thread's arguments.
4065     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4066     return true;
4067   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
4068   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
4069   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
4070   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
4071   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
4072   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
4073   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
4074   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
4075   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
4076   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
4077   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
4078   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
4079   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
4080   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
4081   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
4082   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
4083   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
4084   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
4085   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
4086   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
4087   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
4088   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
4089   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
4090   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
4091     Info.opc = ISD::INTRINSIC_W_CHAIN;
4092     Info.memVT = MVT::v8f16;
4093     Info.ptrVal = I.getArgOperand(0);
4094     Info.offset = 0;
4095     Info.flags = MachineMemOperand::MOLoad;
4096     Info.align = Align(16);
4097     return true;
4098   }
4099   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
4100   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
4101   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
4102   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
4103   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
4104   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
4105   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
4106   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
4107   case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
4108   case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
4109   case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
4110   case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
4111   case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
4112   case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
4113   case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
4114   case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
4115   case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
4116   case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
4117   case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
4118   case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
4119   case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
4120   case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
4121   case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
4122   case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
4123     Info.opc = ISD::INTRINSIC_W_CHAIN;
4124     Info.memVT = MVT::v2i32;
4125     Info.ptrVal = I.getArgOperand(0);
4126     Info.offset = 0;
4127     Info.flags = MachineMemOperand::MOLoad;
4128     Info.align = Align(8);
4129     return true;
4130   }
4131 
4132   case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
4133   case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
4134   case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
4135   case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
4136   case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
4137   case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
4138   case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
4139   case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
4140   case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
4141   case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
4142   case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
4143   case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
4144   case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
4145   case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
4146   case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
4147   case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
4148 
4149   case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
4150   case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
4151   case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
4152   case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
4153   case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
4154   case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
4155   case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
4156   case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
4157   case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
4158   case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
4159   case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
4160   case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
4161   case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
4162   case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
4163   case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
4164   case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
4165   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
4166   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
4167     Info.opc = ISD::INTRINSIC_W_CHAIN;
4168     Info.memVT = MVT::v4i32;
4169     Info.ptrVal = I.getArgOperand(0);
4170     Info.offset = 0;
4171     Info.flags = MachineMemOperand::MOLoad;
4172     Info.align = Align(16);
4173     return true;
4174   }
4175 
4176   case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
4177   case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
4178   case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
4179   case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
4180   case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
4181   case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
4182   case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
4183   case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
4184 
4185   case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
4186   case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
4187   case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
4188   case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
4189   case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
4190   case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
4191   case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
4192   case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
4193   case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
4194   case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
4195   case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
4196   case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
4197   case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
4198   case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
4199   case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
4200   case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
4201   case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
4202   case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
4203   case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
4204   case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
4205   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
4206   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
4207     Info.opc = ISD::INTRINSIC_W_CHAIN;
4208     Info.memVT = MVT::i32;
4209     Info.ptrVal = I.getArgOperand(0);
4210     Info.offset = 0;
4211     Info.flags = MachineMemOperand::MOLoad;
4212     Info.align = Align(4);
4213     return true;
4214   }
4215 
4216   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
4217   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
4218   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
4219   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
4220   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
4221   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
4222   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
4223   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
4224   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
4225   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
4226   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
4227   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
4228     Info.opc = ISD::INTRINSIC_W_CHAIN;
4229     Info.memVT = MVT::v4f16;
4230     Info.ptrVal = I.getArgOperand(0);
4231     Info.offset = 0;
4232     Info.flags = MachineMemOperand::MOLoad;
4233     Info.align = Align(16);
4234     return true;
4235   }
4236 
4237   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
4238   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
4239   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
4240   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
4241   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
4242   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
4243   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
4244   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
4245   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
4246   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
4247   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
4248   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4249   case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4250   case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4251   case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4252   case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4253     Info.opc = ISD::INTRINSIC_W_CHAIN;
4254     Info.memVT = MVT::v8f32;
4255     Info.ptrVal = I.getArgOperand(0);
4256     Info.offset = 0;
4257     Info.flags = MachineMemOperand::MOLoad;
4258     Info.align = Align(16);
4259     return true;
4260   }
4261 
4262   case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4263   case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4264   case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4265   case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4266 
4267   case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4268   case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4269   case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4270   case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4271 
4272   case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4273   case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4274   case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4275   case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4276   case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4277   case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4278   case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4279   case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4280   case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4281   case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4282   case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4283   case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4284     Info.opc = ISD::INTRINSIC_W_CHAIN;
4285     Info.memVT = MVT::v8i32;
4286     Info.ptrVal = I.getArgOperand(0);
4287     Info.offset = 0;
4288     Info.flags = MachineMemOperand::MOLoad;
4289     Info.align = Align(16);
4290     return true;
4291   }
4292 
4293   case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4294   case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4295   case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4296   case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4297   case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4298   case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4299   case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4300   case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4301   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4302   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
4303     Info.opc = ISD::INTRINSIC_W_CHAIN;
4304     Info.memVT = MVT::v2i32;
4305     Info.ptrVal = I.getArgOperand(0);
4306     Info.offset = 0;
4307     Info.flags = MachineMemOperand::MOLoad;
4308     Info.align = Align(8);
4309     return true;
4310   }
4311 
4312   case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4313   case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4314   case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4315   case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4316 
4317   case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4318   case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4319   case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4320   case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4321     Info.opc = ISD::INTRINSIC_W_CHAIN;
4322     Info.memVT = MVT::f64;
4323     Info.ptrVal = I.getArgOperand(0);
4324     Info.offset = 0;
4325     Info.flags = MachineMemOperand::MOLoad;
4326     Info.align = Align(8);
4327     return true;
4328   }
4329 
4330   case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4331   case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4332   case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4333   case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4334     Info.opc = ISD::INTRINSIC_W_CHAIN;
4335     Info.memVT = MVT::v2f64;
4336     Info.ptrVal = I.getArgOperand(0);
4337     Info.offset = 0;
4338     Info.flags = MachineMemOperand::MOLoad;
4339     Info.align = Align(16);
4340     return true;
4341   }
4342 
4343   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4344   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4345   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4346   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4347   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4348   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4349   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4350   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4351   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4352   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4353   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4354   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4355     Info.opc = ISD::INTRINSIC_VOID;
4356     Info.memVT = MVT::v4f16;
4357     Info.ptrVal = I.getArgOperand(0);
4358     Info.offset = 0;
4359     Info.flags = MachineMemOperand::MOStore;
4360     Info.align = Align(16);
4361     return true;
4362   }
4363 
4364   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4365   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4366   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4367   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4368   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4369   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4370   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4371   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4372   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4373   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4374   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4375   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4376   case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4377   case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4378   case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4379   case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4380     Info.opc = ISD::INTRINSIC_VOID;
4381     Info.memVT = MVT::v8f32;
4382     Info.ptrVal = I.getArgOperand(0);
4383     Info.offset = 0;
4384     Info.flags = MachineMemOperand::MOStore;
4385     Info.align = Align(16);
4386     return true;
4387   }
4388 
4389   case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4390   case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4391   case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4392   case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4393   case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4394   case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4395   case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4396   case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4397   case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4398   case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4399   case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4400   case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4401     Info.opc = ISD::INTRINSIC_VOID;
4402     Info.memVT = MVT::v8i32;
4403     Info.ptrVal = I.getArgOperand(0);
4404     Info.offset = 0;
4405     Info.flags = MachineMemOperand::MOStore;
4406     Info.align = Align(16);
4407     return true;
4408   }
4409 
4410   case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4411   case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4412   case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4413   case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4414   case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4415   case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4416   case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4417   case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
4418     Info.opc = ISD::INTRINSIC_VOID;
4419     Info.memVT = MVT::v2i32;
4420     Info.ptrVal = I.getArgOperand(0);
4421     Info.offset = 0;
4422     Info.flags = MachineMemOperand::MOStore;
4423     Info.align = Align(8);
4424     return true;
4425   }
4426 
4427   case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4428   case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4429   case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4430   case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4431     Info.opc = ISD::INTRINSIC_VOID;
4432     Info.memVT = MVT::v2f64;
4433     Info.ptrVal = I.getArgOperand(0);
4434     Info.offset = 0;
4435     Info.flags = MachineMemOperand::MOStore;
4436     Info.align = Align(16);
4437     return true;
4438   }
4439 
4440   case Intrinsic::nvvm_atomic_load_inc_32:
4441   case Intrinsic::nvvm_atomic_load_dec_32:
4442 
4443   case Intrinsic::nvvm_atomic_add_gen_f_cta:
4444   case Intrinsic::nvvm_atomic_add_gen_f_sys:
4445   case Intrinsic::nvvm_atomic_add_gen_i_cta:
4446   case Intrinsic::nvvm_atomic_add_gen_i_sys:
4447   case Intrinsic::nvvm_atomic_and_gen_i_cta:
4448   case Intrinsic::nvvm_atomic_and_gen_i_sys:
4449   case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4450   case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4451   case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4452   case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4453   case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4454   case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4455   case Intrinsic::nvvm_atomic_max_gen_i_cta:
4456   case Intrinsic::nvvm_atomic_max_gen_i_sys:
4457   case Intrinsic::nvvm_atomic_min_gen_i_cta:
4458   case Intrinsic::nvvm_atomic_min_gen_i_sys:
4459   case Intrinsic::nvvm_atomic_or_gen_i_cta:
4460   case Intrinsic::nvvm_atomic_or_gen_i_sys:
4461   case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4462   case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4463   case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4464   case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4465     auto &DL = I.getModule()->getDataLayout();
4466     Info.opc = ISD::INTRINSIC_W_CHAIN;
4467     Info.memVT = getValueType(DL, I.getType());
4468     Info.ptrVal = I.getArgOperand(0);
4469     Info.offset = 0;
4470     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4471     Info.align.reset();
4472     return true;
4473   }
4474 
4475   case Intrinsic::nvvm_ldu_global_i:
4476   case Intrinsic::nvvm_ldu_global_f:
4477   case Intrinsic::nvvm_ldu_global_p: {
4478     auto &DL = I.getModule()->getDataLayout();
4479     Info.opc = ISD::INTRINSIC_W_CHAIN;
4480     if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
4481       Info.memVT = getValueType(DL, I.getType());
4482     else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
4483       Info.memVT = getPointerTy(DL);
4484     else
4485       Info.memVT = getValueType(DL, I.getType());
4486     Info.ptrVal = I.getArgOperand(0);
4487     Info.offset = 0;
4488     Info.flags = MachineMemOperand::MOLoad;
4489     Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4490 
4491     return true;
4492   }
4493   case Intrinsic::nvvm_ldg_global_i:
4494   case Intrinsic::nvvm_ldg_global_f:
4495   case Intrinsic::nvvm_ldg_global_p: {
4496     auto &DL = I.getModule()->getDataLayout();
4497 
4498     Info.opc = ISD::INTRINSIC_W_CHAIN;
4499     if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
4500       Info.memVT = getValueType(DL, I.getType());
4501     else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
4502       Info.memVT = getPointerTy(DL);
4503     else
4504       Info.memVT = getValueType(DL, I.getType());
4505     Info.ptrVal = I.getArgOperand(0);
4506     Info.offset = 0;
4507     Info.flags = MachineMemOperand::MOLoad;
4508     Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4509 
4510     return true;
4511   }
4512 
4513   case Intrinsic::nvvm_tex_1d_v4f32_s32:
4514   case Intrinsic::nvvm_tex_1d_v4f32_f32:
4515   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4516   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4517   case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4518   case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4519   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4520   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4521   case Intrinsic::nvvm_tex_2d_v4f32_s32:
4522   case Intrinsic::nvvm_tex_2d_v4f32_f32:
4523   case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4524   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4525   case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4526   case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4527   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4528   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4529   case Intrinsic::nvvm_tex_3d_v4f32_s32:
4530   case Intrinsic::nvvm_tex_3d_v4f32_f32:
4531   case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4532   case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4533   case Intrinsic::nvvm_tex_cube_v4f32_f32:
4534   case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4535   case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4536   case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4537   case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4538   case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4539   case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4540   case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4541   case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4542   case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4543   case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4544   case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4545   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4546   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4547   case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4548   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4549   case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4550   case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4551   case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4552   case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4553   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4554   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4555   case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4556   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4557   case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4558   case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4559   case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4560   case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4561   case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4562   case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4563   case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4564   case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4565   case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4566   case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4567   case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4568   case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4569   case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4570   case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4571     Info.opc = getOpcForTextureInstr(Intrinsic);
4572     Info.memVT = MVT::v4f32;
4573     Info.ptrVal = nullptr;
4574     Info.offset = 0;
4575     Info.flags = MachineMemOperand::MOLoad;
4576     Info.align = Align(16);
4577     return true;
4578 
4579   case Intrinsic::nvvm_tex_1d_v4s32_s32:
4580   case Intrinsic::nvvm_tex_1d_v4s32_f32:
4581   case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4582   case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4583   case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4584   case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4585   case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4586   case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4587   case Intrinsic::nvvm_tex_2d_v4s32_s32:
4588   case Intrinsic::nvvm_tex_2d_v4s32_f32:
4589   case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4590   case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4591   case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4592   case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4593   case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4594   case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4595   case Intrinsic::nvvm_tex_3d_v4s32_s32:
4596   case Intrinsic::nvvm_tex_3d_v4s32_f32:
4597   case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4598   case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4599   case Intrinsic::nvvm_tex_cube_v4s32_f32:
4600   case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4601   case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4602   case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4603   case Intrinsic::nvvm_tex_cube_v4u32_f32:
4604   case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4605   case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4606   case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4607   case Intrinsic::nvvm_tex_1d_v4u32_s32:
4608   case Intrinsic::nvvm_tex_1d_v4u32_f32:
4609   case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4610   case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4611   case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4612   case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4613   case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4614   case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4615   case Intrinsic::nvvm_tex_2d_v4u32_s32:
4616   case Intrinsic::nvvm_tex_2d_v4u32_f32:
4617   case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4618   case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4619   case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4620   case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4621   case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4622   case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4623   case Intrinsic::nvvm_tex_3d_v4u32_s32:
4624   case Intrinsic::nvvm_tex_3d_v4u32_f32:
4625   case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4626   case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4627   case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4628   case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4629   case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4630   case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4631   case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4632   case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4633   case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4634   case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4635   case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4636   case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4637   case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4638   case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4639   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4640   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4641   case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4642   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4643   case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4644   case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4645   case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4646   case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4647   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4648   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4649   case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4650   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4651   case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4652   case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4653   case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4654   case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4655   case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4656   case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4657   case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4658   case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4659   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4660   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4661   case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4662   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4663   case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4664   case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4665   case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4666   case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4667   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4668   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4669   case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4670   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4671   case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4672   case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4673   case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4674   case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4675   case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4676   case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4677   case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4678   case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4679   case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4680   case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4681   case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4682   case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4683   case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4684   case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4685   case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4686   case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4687   case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4688   case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4689   case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4690   case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4691   case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4692   case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4693   case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4694   case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4695     Info.opc = getOpcForTextureInstr(Intrinsic);
4696     Info.memVT = MVT::v4i32;
4697     Info.ptrVal = nullptr;
4698     Info.offset = 0;
4699     Info.flags = MachineMemOperand::MOLoad;
4700     Info.align = Align(16);
4701     return true;
4702 
4703   case Intrinsic::nvvm_suld_1d_i8_clamp:
4704   case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4705   case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4706   case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4707   case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4708   case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4709   case Intrinsic::nvvm_suld_2d_i8_clamp:
4710   case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4711   case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4712   case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4713   case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4714   case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4715   case Intrinsic::nvvm_suld_3d_i8_clamp:
4716   case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4717   case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4718   case Intrinsic::nvvm_suld_1d_i8_trap:
4719   case Intrinsic::nvvm_suld_1d_v2i8_trap:
4720   case Intrinsic::nvvm_suld_1d_v4i8_trap:
4721   case Intrinsic::nvvm_suld_1d_array_i8_trap:
4722   case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4723   case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4724   case Intrinsic::nvvm_suld_2d_i8_trap:
4725   case Intrinsic::nvvm_suld_2d_v2i8_trap:
4726   case Intrinsic::nvvm_suld_2d_v4i8_trap:
4727   case Intrinsic::nvvm_suld_2d_array_i8_trap:
4728   case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4729   case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4730   case Intrinsic::nvvm_suld_3d_i8_trap:
4731   case Intrinsic::nvvm_suld_3d_v2i8_trap:
4732   case Intrinsic::nvvm_suld_3d_v4i8_trap:
4733   case Intrinsic::nvvm_suld_1d_i8_zero:
4734   case Intrinsic::nvvm_suld_1d_v2i8_zero:
4735   case Intrinsic::nvvm_suld_1d_v4i8_zero:
4736   case Intrinsic::nvvm_suld_1d_array_i8_zero:
4737   case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4738   case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4739   case Intrinsic::nvvm_suld_2d_i8_zero:
4740   case Intrinsic::nvvm_suld_2d_v2i8_zero:
4741   case Intrinsic::nvvm_suld_2d_v4i8_zero:
4742   case Intrinsic::nvvm_suld_2d_array_i8_zero:
4743   case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4744   case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4745   case Intrinsic::nvvm_suld_3d_i8_zero:
4746   case Intrinsic::nvvm_suld_3d_v2i8_zero:
4747   case Intrinsic::nvvm_suld_3d_v4i8_zero:
4748     Info.opc = getOpcForSurfaceInstr(Intrinsic);
4749     Info.memVT = MVT::i8;
4750     Info.ptrVal = nullptr;
4751     Info.offset = 0;
4752     Info.flags = MachineMemOperand::MOLoad;
4753     Info.align = Align(16);
4754     return true;
4755 
4756   case Intrinsic::nvvm_suld_1d_i16_clamp:
4757   case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4758   case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4759   case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4760   case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4761   case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4762   case Intrinsic::nvvm_suld_2d_i16_clamp:
4763   case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4764   case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4765   case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4766   case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4767   case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4768   case Intrinsic::nvvm_suld_3d_i16_clamp:
4769   case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4770   case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4771   case Intrinsic::nvvm_suld_1d_i16_trap:
4772   case Intrinsic::nvvm_suld_1d_v2i16_trap:
4773   case Intrinsic::nvvm_suld_1d_v4i16_trap:
4774   case Intrinsic::nvvm_suld_1d_array_i16_trap:
4775   case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4776   case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4777   case Intrinsic::nvvm_suld_2d_i16_trap:
4778   case Intrinsic::nvvm_suld_2d_v2i16_trap:
4779   case Intrinsic::nvvm_suld_2d_v4i16_trap:
4780   case Intrinsic::nvvm_suld_2d_array_i16_trap:
4781   case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4782   case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4783   case Intrinsic::nvvm_suld_3d_i16_trap:
4784   case Intrinsic::nvvm_suld_3d_v2i16_trap:
4785   case Intrinsic::nvvm_suld_3d_v4i16_trap:
4786   case Intrinsic::nvvm_suld_1d_i16_zero:
4787   case Intrinsic::nvvm_suld_1d_v2i16_zero:
4788   case Intrinsic::nvvm_suld_1d_v4i16_zero:
4789   case Intrinsic::nvvm_suld_1d_array_i16_zero:
4790   case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4791   case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4792   case Intrinsic::nvvm_suld_2d_i16_zero:
4793   case Intrinsic::nvvm_suld_2d_v2i16_zero:
4794   case Intrinsic::nvvm_suld_2d_v4i16_zero:
4795   case Intrinsic::nvvm_suld_2d_array_i16_zero:
4796   case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4797   case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4798   case Intrinsic::nvvm_suld_3d_i16_zero:
4799   case Intrinsic::nvvm_suld_3d_v2i16_zero:
4800   case Intrinsic::nvvm_suld_3d_v4i16_zero:
4801     Info.opc = getOpcForSurfaceInstr(Intrinsic);
4802     Info.memVT = MVT::i16;
4803     Info.ptrVal = nullptr;
4804     Info.offset = 0;
4805     Info.flags = MachineMemOperand::MOLoad;
4806     Info.align = Align(16);
4807     return true;
4808 
4809   case Intrinsic::nvvm_suld_1d_i32_clamp:
4810   case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4811   case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4812   case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4813   case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4814   case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4815   case Intrinsic::nvvm_suld_2d_i32_clamp:
4816   case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4817   case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4818   case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4819   case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4820   case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4821   case Intrinsic::nvvm_suld_3d_i32_clamp:
4822   case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4823   case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4824   case Intrinsic::nvvm_suld_1d_i32_trap:
4825   case Intrinsic::nvvm_suld_1d_v2i32_trap:
4826   case Intrinsic::nvvm_suld_1d_v4i32_trap:
4827   case Intrinsic::nvvm_suld_1d_array_i32_trap:
4828   case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4829   case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4830   case Intrinsic::nvvm_suld_2d_i32_trap:
4831   case Intrinsic::nvvm_suld_2d_v2i32_trap:
4832   case Intrinsic::nvvm_suld_2d_v4i32_trap:
4833   case Intrinsic::nvvm_suld_2d_array_i32_trap:
4834   case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4835   case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4836   case Intrinsic::nvvm_suld_3d_i32_trap:
4837   case Intrinsic::nvvm_suld_3d_v2i32_trap:
4838   case Intrinsic::nvvm_suld_3d_v4i32_trap:
4839   case Intrinsic::nvvm_suld_1d_i32_zero:
4840   case Intrinsic::nvvm_suld_1d_v2i32_zero:
4841   case Intrinsic::nvvm_suld_1d_v4i32_zero:
4842   case Intrinsic::nvvm_suld_1d_array_i32_zero:
4843   case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4844   case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4845   case Intrinsic::nvvm_suld_2d_i32_zero:
4846   case Intrinsic::nvvm_suld_2d_v2i32_zero:
4847   case Intrinsic::nvvm_suld_2d_v4i32_zero:
4848   case Intrinsic::nvvm_suld_2d_array_i32_zero:
4849   case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4850   case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4851   case Intrinsic::nvvm_suld_3d_i32_zero:
4852   case Intrinsic::nvvm_suld_3d_v2i32_zero:
4853   case Intrinsic::nvvm_suld_3d_v4i32_zero:
4854     Info.opc = getOpcForSurfaceInstr(Intrinsic);
4855     Info.memVT = MVT::i32;
4856     Info.ptrVal = nullptr;
4857     Info.offset = 0;
4858     Info.flags = MachineMemOperand::MOLoad;
4859     Info.align = Align(16);
4860     return true;
4861 
4862   case Intrinsic::nvvm_suld_1d_i64_clamp:
4863   case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4864   case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4865   case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4866   case Intrinsic::nvvm_suld_2d_i64_clamp:
4867   case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4868   case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4869   case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4870   case Intrinsic::nvvm_suld_3d_i64_clamp:
4871   case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4872   case Intrinsic::nvvm_suld_1d_i64_trap:
4873   case Intrinsic::nvvm_suld_1d_v2i64_trap:
4874   case Intrinsic::nvvm_suld_1d_array_i64_trap:
4875   case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4876   case Intrinsic::nvvm_suld_2d_i64_trap:
4877   case Intrinsic::nvvm_suld_2d_v2i64_trap:
4878   case Intrinsic::nvvm_suld_2d_array_i64_trap:
4879   case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4880   case Intrinsic::nvvm_suld_3d_i64_trap:
4881   case Intrinsic::nvvm_suld_3d_v2i64_trap:
4882   case Intrinsic::nvvm_suld_1d_i64_zero:
4883   case Intrinsic::nvvm_suld_1d_v2i64_zero:
4884   case Intrinsic::nvvm_suld_1d_array_i64_zero:
4885   case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4886   case Intrinsic::nvvm_suld_2d_i64_zero:
4887   case Intrinsic::nvvm_suld_2d_v2i64_zero:
4888   case Intrinsic::nvvm_suld_2d_array_i64_zero:
4889   case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4890   case Intrinsic::nvvm_suld_3d_i64_zero:
4891   case Intrinsic::nvvm_suld_3d_v2i64_zero:
4892     Info.opc = getOpcForSurfaceInstr(Intrinsic);
4893     Info.memVT = MVT::i64;
4894     Info.ptrVal = nullptr;
4895     Info.offset = 0;
4896     Info.flags = MachineMemOperand::MOLoad;
4897     Info.align = Align(16);
4898     return true;
4899   }
4900   return false;
4901 }
4902 
4903 /// getFunctionParamOptimizedAlign - since function arguments are passed via
4904 /// .param space, we may want to increase their alignment in a way that
4905 /// ensures that we can effectively vectorize their loads & stores. We can
4906 /// increase alignment only if the function has internal or has private
4907 /// linkage as for other linkage types callers may already rely on default
4908 /// alignment. To allow using 128-bit vectorized loads/stores, this function
4909 /// ensures that alignment is 16 or greater.
4910 Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
4911     const Function *F, Type *ArgTy, const DataLayout &DL) const {
4912   const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value();
4913 
4914   // If a function has linkage different from internal or private, we
4915   // must use default ABI alignment as external users rely on it. Same
4916   // for a function that may be called from a function pointer.
4917   if (!F || !F->hasLocalLinkage() ||
4918       F->hasAddressTaken(/*Users=*/nullptr,
4919                          /*IgnoreCallbackUses=*/false,
4920                          /*IgnoreAssumeLikeCalls=*/true,
4921                          /*IgnoreLLVMUsed=*/true))
4922     return Align(ABITypeAlign);
4923 
4924   assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
4925   return Align(std::max(uint64_t(16), ABITypeAlign));
4926 }
4927 
4928 /// Helper for computing alignment of a device function byval parameter.
4929 Align NVPTXTargetLowering::getFunctionByValParamAlign(
4930     const Function *F, Type *ArgTy, Align InitialAlign,
4931     const DataLayout &DL) const {
4932   Align ArgAlign = InitialAlign;
4933   // Try to increase alignment to enhance vectorization options.
4934   if (F)
4935     ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
4936 
4937   // Old ptx versions have a bug. When PTX code takes address of
4938   // byval parameter with alignment < 4, ptxas generates code to
4939   // spill argument into memory. Alas on sm_50+ ptxas generates
4940   // SASS code that fails with misaligned access. To work around
4941   // the problem, make sure that we align byval parameters by at
4942   // least 4. This bug seems to be fixed at least starting from
4943   // ptxas > 9.0.
4944   // TODO: remove this after verifying the bug is not reproduced
4945   // on non-deprecated ptxas versions.
4946   if (ForceMinByValParamAlign)
4947     ArgAlign = std::max(ArgAlign, Align(4));
4948 
4949   return ArgAlign;
4950 }
4951 
4952 // Helper for getting a function parameter name. Name is composed from
4953 // its index and the function name. Negative index corresponds to special
4954 // parameter (unsized array) used for passing variable arguments.
4955 std::string NVPTXTargetLowering::getParamName(const Function *F,
4956                                               int Idx) const {
4957   std::string ParamName;
4958   raw_string_ostream ParamStr(ParamName);
4959 
4960   ParamStr << getTargetMachine().getSymbol(F)->getName();
4961   if (Idx < 0)
4962     ParamStr << "_vararg";
4963   else
4964     ParamStr << "_param_" << Idx;
4965 
4966   return ParamName;
4967 }
4968 
4969 /// isLegalAddressingMode - Return true if the addressing mode represented
4970 /// by AM is legal for this target, for a load/store of the specified type.
4971 /// Used to guide target specific optimizations, like loop strength reduction
4972 /// (LoopStrengthReduce.cpp) and memory optimization for address mode
4973 /// (CodeGenPrepare.cpp)
4974 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
4975                                                 const AddrMode &AM, Type *Ty,
4976                                                 unsigned AS, Instruction *I) const {
4977   // AddrMode - This represents an addressing mode of:
4978   //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
4979   //
4980   // The legal address modes are
4981   // - [avar]
4982   // - [areg]
4983   // - [areg+immoff]
4984   // - [immAddr]
4985 
4986   if (AM.BaseGV) {
4987     return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
4988   }
4989 
4990   switch (AM.Scale) {
4991   case 0: // "r", "r+i" or "i" is allowed
4992     break;
4993   case 1:
4994     if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
4995       return false;
4996     // Otherwise we have r+i.
4997     break;
4998   default:
4999     // No scale > 1 is allowed
5000     return false;
5001   }
5002   return true;
5003 }
5004 
5005 //===----------------------------------------------------------------------===//
5006 //                         NVPTX Inline Assembly Support
5007 //===----------------------------------------------------------------------===//
5008 
5009 /// getConstraintType - Given a constraint letter, return the type of
5010 /// constraint it is for this target.
5011 NVPTXTargetLowering::ConstraintType
5012 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
5013   if (Constraint.size() == 1) {
5014     switch (Constraint[0]) {
5015     default:
5016       break;
5017     case 'b':
5018     case 'r':
5019     case 'h':
5020     case 'c':
5021     case 'l':
5022     case 'f':
5023     case 'd':
5024     case '0':
5025     case 'N':
5026       return C_RegisterClass;
5027     }
5028   }
5029   return TargetLowering::getConstraintType(Constraint);
5030 }
5031 
5032 std::pair<unsigned, const TargetRegisterClass *>
5033 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
5034                                                   StringRef Constraint,
5035                                                   MVT VT) const {
5036   if (Constraint.size() == 1) {
5037     switch (Constraint[0]) {
5038     case 'b':
5039       return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
5040     case 'c':
5041       return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5042     case 'h':
5043       return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5044     case 'r':
5045       return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
5046     case 'l':
5047     case 'N':
5048       return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
5049     case 'f':
5050       return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
5051     case 'd':
5052       return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
5053     }
5054   }
5055   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5056 }
5057 
5058 //===----------------------------------------------------------------------===//
5059 //                         NVPTX DAG Combining
5060 //===----------------------------------------------------------------------===//
5061 
5062 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
5063                                    CodeGenOptLevel OptLevel) const {
5064   // Always honor command-line argument
5065   if (FMAContractLevelOpt.getNumOccurrences() > 0)
5066     return FMAContractLevelOpt > 0;
5067 
5068   // Do not contract if we're not optimizing the code.
5069   if (OptLevel == CodeGenOptLevel::None)
5070     return false;
5071 
5072   // Honor TargetOptions flags that explicitly say fusion is okay.
5073   if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
5074     return true;
5075 
5076   return allowUnsafeFPMath(MF);
5077 }
5078 
5079 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
5080   // Honor TargetOptions flags that explicitly say unsafe math is okay.
5081   if (MF.getTarget().Options.UnsafeFPMath)
5082     return true;
5083 
5084   // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
5085   const Function &F = MF.getFunction();
5086   return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
5087 }
5088 
5089 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5090 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
5091 /// called with the default operands, and if that fails, with commuted
5092 /// operands.
5093 static SDValue PerformADDCombineWithOperands(
5094     SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI,
5095     const NVPTXSubtarget &Subtarget, CodeGenOptLevel OptLevel) {
5096   SelectionDAG  &DAG = DCI.DAG;
5097   // Skip non-integer, non-scalar case
5098   EVT VT=N0.getValueType();
5099   if (VT.isVector())
5100     return SDValue();
5101 
5102   // fold (add (mul a, b), c) -> (mad a, b, c)
5103   //
5104   if (N0.getOpcode() == ISD::MUL) {
5105     assert (VT.isInteger());
5106     // For integer:
5107     // Since integer multiply-add costs the same as integer multiply
5108     // but is more costly than integer add, do the fusion only when
5109     // the mul is only used in the add.
5110     if (OptLevel == CodeGenOptLevel::None || VT != MVT::i32 ||
5111         !N0.getNode()->hasOneUse())
5112       return SDValue();
5113 
5114     // Do the folding
5115     return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
5116                        N0.getOperand(0), N0.getOperand(1), N1);
5117   }
5118   else if (N0.getOpcode() == ISD::FMUL) {
5119     if (VT == MVT::f32 || VT == MVT::f64) {
5120       const auto *TLI = static_cast<const NVPTXTargetLowering *>(
5121           &DAG.getTargetLoweringInfo());
5122       if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
5123         return SDValue();
5124 
5125       // For floating point:
5126       // Do the fusion only when the mul has less than 5 uses and all
5127       // are add.
5128       // The heuristic is that if a use is not an add, then that use
5129       // cannot be fused into fma, therefore mul is still needed anyway.
5130       // If there are more than 4 uses, even if they are all add, fusing
5131       // them will increase register pressue.
5132       //
5133       int numUses = 0;
5134       int nonAddCount = 0;
5135       for (const SDNode *User : N0.getNode()->uses()) {
5136         numUses++;
5137         if (User->getOpcode() != ISD::FADD)
5138           ++nonAddCount;
5139       }
5140       if (numUses >= 5)
5141         return SDValue();
5142       if (nonAddCount) {
5143         int orderNo = N->getIROrder();
5144         int orderNo2 = N0.getNode()->getIROrder();
5145         // simple heuristics here for considering potential register
5146         // pressure, the logics here is that the differnce are used
5147         // to measure the distance between def and use, the longer distance
5148         // more likely cause register pressure.
5149         if (orderNo - orderNo2 < 500)
5150           return SDValue();
5151 
5152         // Now, check if at least one of the FMUL's operands is live beyond the node N,
5153         // which guarantees that the FMA will not increase register pressure at node N.
5154         bool opIsLive = false;
5155         const SDNode *left = N0.getOperand(0).getNode();
5156         const SDNode *right = N0.getOperand(1).getNode();
5157 
5158         if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5159           opIsLive = true;
5160 
5161         if (!opIsLive)
5162           for (const SDNode *User : left->uses()) {
5163             int orderNo3 = User->getIROrder();
5164             if (orderNo3 > orderNo) {
5165               opIsLive = true;
5166               break;
5167             }
5168           }
5169 
5170         if (!opIsLive)
5171           for (const SDNode *User : right->uses()) {
5172             int orderNo3 = User->getIROrder();
5173             if (orderNo3 > orderNo) {
5174               opIsLive = true;
5175               break;
5176             }
5177           }
5178 
5179         if (!opIsLive)
5180           return SDValue();
5181       }
5182 
5183       return DAG.getNode(ISD::FMA, SDLoc(N), VT,
5184                          N0.getOperand(0), N0.getOperand(1), N1);
5185     }
5186   }
5187 
5188   return SDValue();
5189 }
5190 
5191 static SDValue PerformStoreRetvalCombine(SDNode *N) {
5192   // Operands from the 2nd to the last one are the values to be stored
5193   for (std::size_t I = 2, OpsCount = N->ops().size(); I != OpsCount; ++I)
5194     if (!N->getOperand(I).isUndef())
5195       return SDValue();
5196 
5197   // Operand 0 is the previous value in the chain. Cannot return EntryToken
5198   // as the previous value will become unused and eliminated later.
5199   return N->getOperand(0);
5200 }
5201 
5202 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5203 ///
5204 static SDValue PerformADDCombine(SDNode *N,
5205                                  TargetLowering::DAGCombinerInfo &DCI,
5206                                  const NVPTXSubtarget &Subtarget,
5207                                  CodeGenOptLevel OptLevel) {
5208   SDValue N0 = N->getOperand(0);
5209   SDValue N1 = N->getOperand(1);
5210 
5211   // First try with the default operand order.
5212   if (SDValue Result =
5213           PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
5214     return Result;
5215 
5216   // If that didn't work, try again with the operands commuted.
5217   return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
5218 }
5219 
5220 static SDValue PerformANDCombine(SDNode *N,
5221                                  TargetLowering::DAGCombinerInfo &DCI) {
5222   // The type legalizer turns a vector load of i8 values into a zextload to i16
5223   // registers, optionally ANY_EXTENDs it (if target type is integer),
5224   // and ANDs off the high 8 bits. Since we turn this load into a
5225   // target-specific DAG node, the DAG combiner fails to eliminate these AND
5226   // nodes. Do that here.
5227   SDValue Val = N->getOperand(0);
5228   SDValue Mask = N->getOperand(1);
5229 
5230   if (isa<ConstantSDNode>(Val)) {
5231     std::swap(Val, Mask);
5232   }
5233 
5234   SDValue AExt;
5235 
5236   // Convert BFE-> truncate i16 -> and 255
5237   // To just BFE-> truncate i16, as the value already has all the bits in the
5238   // right places.
5239   if (Val.getOpcode() == ISD::TRUNCATE) {
5240     SDValue BFE = Val.getOperand(0);
5241     if (BFE.getOpcode() != NVPTXISD::BFE)
5242       return SDValue();
5243 
5244     ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
5245     if (!BFEBits)
5246       return SDValue();
5247     uint64_t BFEBitsVal = BFEBits->getZExtValue();
5248 
5249     ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
5250     if (!MaskCnst) {
5251       // Not an AND with a constant
5252       return SDValue();
5253     }
5254     uint64_t MaskVal = MaskCnst->getZExtValue();
5255 
5256     if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
5257       return SDValue();
5258     // If we get here, the AND is unnecessary.  Just replace it with the trunc
5259     DCI.CombineTo(N, Val, false);
5260   }
5261   // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
5262   if (Val.getOpcode() == ISD::ANY_EXTEND) {
5263     AExt = Val;
5264     Val = Val->getOperand(0);
5265   }
5266 
5267   if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
5268     Val = Val->getOperand(0);
5269   }
5270 
5271   if (Val->getOpcode() == NVPTXISD::LoadV2 ||
5272       Val->getOpcode() == NVPTXISD::LoadV4) {
5273     ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
5274     if (!MaskCnst) {
5275       // Not an AND with a constant
5276       return SDValue();
5277     }
5278 
5279     uint64_t MaskVal = MaskCnst->getZExtValue();
5280     if (MaskVal != 0xff) {
5281       // Not an AND that chops off top 8 bits
5282       return SDValue();
5283     }
5284 
5285     MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
5286     if (!Mem) {
5287       // Not a MemSDNode?!?
5288       return SDValue();
5289     }
5290 
5291     EVT MemVT = Mem->getMemoryVT();
5292     if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
5293       // We only handle the i8 case
5294       return SDValue();
5295     }
5296 
5297     unsigned ExtType = Val->getConstantOperandVal(Val->getNumOperands() - 1);
5298     if (ExtType == ISD::SEXTLOAD) {
5299       // If for some reason the load is a sextload, the and is needed to zero
5300       // out the high 8 bits
5301       return SDValue();
5302     }
5303 
5304     bool AddTo = false;
5305     if (AExt.getNode() != nullptr) {
5306       // Re-insert the ext as a zext.
5307       Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
5308                             AExt.getValueType(), Val);
5309       AddTo = true;
5310     }
5311 
5312     // If we get here, the AND is unnecessary.  Just replace it with the load
5313     DCI.CombineTo(N, Val, AddTo);
5314   }
5315 
5316   return SDValue();
5317 }
5318 
5319 static SDValue PerformREMCombine(SDNode *N,
5320                                  TargetLowering::DAGCombinerInfo &DCI,
5321                                  CodeGenOptLevel OptLevel) {
5322   assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
5323 
5324   // Don't do anything at less than -O2.
5325   if (OptLevel < CodeGenOptLevel::Default)
5326     return SDValue();
5327 
5328   SelectionDAG &DAG = DCI.DAG;
5329   SDLoc DL(N);
5330   EVT VT = N->getValueType(0);
5331   bool IsSigned = N->getOpcode() == ISD::SREM;
5332   unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
5333 
5334   const SDValue &Num = N->getOperand(0);
5335   const SDValue &Den = N->getOperand(1);
5336 
5337   for (const SDNode *U : Num->uses()) {
5338     if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
5339         U->getOperand(1) == Den) {
5340       // Num % Den -> Num - (Num / Den) * Den
5341       return DAG.getNode(ISD::SUB, DL, VT, Num,
5342                          DAG.getNode(ISD::MUL, DL, VT,
5343                                      DAG.getNode(DivOpc, DL, VT, Num, Den),
5344                                      Den));
5345     }
5346   }
5347   return SDValue();
5348 }
5349 
/// Signedness of a candidate mul.wide operand, as reported by
/// IsMulWideOperandDemotable.
enum OperandSignedness {
  Signed = 0,   // Operand comes from a sign extension.
  Unsigned = 1, // Operand comes from a zero extension.
  Unknown = 2   // Signedness could not be determined.
};
5355 
5356 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5357 /// that can be demoted to \p OptSize bits without loss of information. The
5358 /// signedness of the operand, if determinable, is placed in \p S.
5359 static bool IsMulWideOperandDemotable(SDValue Op,
5360                                       unsigned OptSize,
5361                                       OperandSignedness &S) {
5362   S = Unknown;
5363 
5364   if (Op.getOpcode() == ISD::SIGN_EXTEND ||
5365       Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
5366     EVT OrigVT = Op.getOperand(0).getValueType();
5367     if (OrigVT.getFixedSizeInBits() <= OptSize) {
5368       S = Signed;
5369       return true;
5370     }
5371   } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
5372     EVT OrigVT = Op.getOperand(0).getValueType();
5373     if (OrigVT.getFixedSizeInBits() <= OptSize) {
5374       S = Unsigned;
5375       return true;
5376     }
5377   }
5378 
5379   return false;
5380 }
5381 
5382 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
5383 /// be demoted to \p OptSize bits without loss of information. If the operands
5384 /// contain a constant, it should appear as the RHS operand. The signedness of
5385 /// the operands is placed in \p IsSigned.
5386 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
5387                                         unsigned OptSize,
5388                                         bool &IsSigned) {
5389   OperandSignedness LHSSign;
5390 
5391   // The LHS operand must be a demotable op
5392   if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
5393     return false;
5394 
5395   // We should have been able to determine the signedness from the LHS
5396   if (LHSSign == Unknown)
5397     return false;
5398 
5399   IsSigned = (LHSSign == Signed);
5400 
5401   // The RHS can be a demotable op or a constant
5402   if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
5403     const APInt &Val = CI->getAPIntValue();
5404     if (LHSSign == Unsigned) {
5405       return Val.isIntN(OptSize);
5406     } else {
5407       return Val.isSignedIntN(OptSize);
5408     }
5409   } else {
5410     OperandSignedness RHSSign;
5411     if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
5412       return false;
5413 
5414     return LHSSign == RHSSign;
5415   }
5416 }
5417 
5418 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
5419 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
5420 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
5421 /// amount.
5422 static SDValue TryMULWIDECombine(SDNode *N,
5423                                  TargetLowering::DAGCombinerInfo &DCI) {
5424   EVT MulType = N->getValueType(0);
5425   if (MulType != MVT::i32 && MulType != MVT::i64) {
5426     return SDValue();
5427   }
5428 
5429   SDLoc DL(N);
5430   unsigned OptSize = MulType.getSizeInBits() >> 1;
5431   SDValue LHS = N->getOperand(0);
5432   SDValue RHS = N->getOperand(1);
5433 
5434   // Canonicalize the multiply so the constant (if any) is on the right
5435   if (N->getOpcode() == ISD::MUL) {
5436     if (isa<ConstantSDNode>(LHS)) {
5437       std::swap(LHS, RHS);
5438     }
5439   }
5440 
5441   // If we have a SHL, determine the actual multiply amount
5442   if (N->getOpcode() == ISD::SHL) {
5443     ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
5444     if (!ShlRHS) {
5445       return SDValue();
5446     }
5447 
5448     APInt ShiftAmt = ShlRHS->getAPIntValue();
5449     unsigned BitWidth = MulType.getSizeInBits();
5450     if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
5451       APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
5452       RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
5453     } else {
5454       return SDValue();
5455     }
5456   }
5457 
5458   bool Signed;
5459   // Verify that our operands are demotable
5460   if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
5461     return SDValue();
5462   }
5463 
5464   EVT DemotedVT;
5465   if (MulType == MVT::i32) {
5466     DemotedVT = MVT::i16;
5467   } else {
5468     DemotedVT = MVT::i32;
5469   }
5470 
5471   // Truncate the operands to the correct size. Note that these are just for
5472   // type consistency and will (likely) be eliminated in later phases.
5473   SDValue TruncLHS =
5474     DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
5475   SDValue TruncRHS =
5476     DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
5477 
5478   unsigned Opc;
5479   if (Signed) {
5480     Opc = NVPTXISD::MUL_WIDE_SIGNED;
5481   } else {
5482     Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
5483   }
5484 
5485   return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
5486 }
5487 
5488 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
5489 static SDValue PerformMULCombine(SDNode *N,
5490                                  TargetLowering::DAGCombinerInfo &DCI,
5491                                  CodeGenOptLevel OptLevel) {
5492   if (OptLevel > CodeGenOptLevel::None) {
5493     // Try mul.wide combining at OptLevel > 0
5494     if (SDValue Ret = TryMULWIDECombine(N, DCI))
5495       return Ret;
5496   }
5497 
5498   return SDValue();
5499 }
5500 
5501 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
5502 static SDValue PerformSHLCombine(SDNode *N,
5503                                  TargetLowering::DAGCombinerInfo &DCI,
5504                                  CodeGenOptLevel OptLevel) {
5505   if (OptLevel > CodeGenOptLevel::None) {
5506     // Try mul.wide combining at OptLevel > 0
5507     if (SDValue Ret = TryMULWIDECombine(N, DCI))
5508       return Ret;
5509   }
5510 
5511   return SDValue();
5512 }
5513 
5514 static SDValue PerformSETCCCombine(SDNode *N,
5515                                    TargetLowering::DAGCombinerInfo &DCI,
5516                                    unsigned int SmVersion) {
5517   EVT CCType = N->getValueType(0);
5518   SDValue A = N->getOperand(0);
5519   SDValue B = N->getOperand(1);
5520 
5521   EVT AType = A.getValueType();
5522   if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
5523     return SDValue();
5524 
5525   if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
5526     return SDValue();
5527 
5528   SDLoc DL(N);
5529   // setp.f16x2 returns two scalar predicates, which we need to
5530   // convert back to v2i1. The returned result will be scalarized by
5531   // the legalizer, but the comparison will remain a single vector
5532   // instruction.
5533   SDValue CCNode = DCI.DAG.getNode(
5534       A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
5535                                      : NVPTXISD::SETP_BF16X2,
5536       DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
5537   return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
5538                          CCNode.getValue(1));
5539 }
5540 
5541 static SDValue PerformEXTRACTCombine(SDNode *N,
5542                                      TargetLowering::DAGCombinerInfo &DCI) {
5543   SDValue Vector = N->getOperand(0);
5544   SDLoc DL(N);
5545   EVT VectorVT = Vector.getValueType();
5546   if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
5547       IsPTXVectorType(VectorVT.getSimpleVT()))
5548     return SDValue(); // Native vector loads already combine nicely w/
5549                       // extract_vector_elt, except for v4i8.
5550   // Don't mess with singletons or v2*16 types, we already handle them OK.
5551   if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
5552       VectorVT == MVT::v4i8)
5553     return SDValue();
5554 
5555   uint64_t VectorBits = VectorVT.getSizeInBits();
5556   // We only handle the types we can extract in-register.
5557   if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
5558     return SDValue();
5559 
5560   ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
5561   // Index == 0 is handled by generic DAG combiner.
5562   if (!Index || Index->getZExtValue() == 0)
5563     return SDValue();
5564 
5565   MVT IVT = MVT::getIntegerVT(VectorBits);
5566   EVT EltVT = VectorVT.getVectorElementType();
5567   EVT EltIVT = EltVT.changeTypeToInteger();
5568   uint64_t EltBits = EltVT.getScalarSizeInBits();
5569 
5570   SDValue Result = DCI.DAG.getNode(
5571       ISD::TRUNCATE, DL, EltIVT,
5572       DCI.DAG.getNode(
5573           ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
5574           DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
5575 
5576   // If element has non-integer type, bitcast it back to the expected type.
5577   if (EltVT != EltIVT)
5578     Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
5579   // Past legalizer, we may need to extent i8 -> i16 to match the register type.
5580   if (EltVT != N->getValueType(0))
5581     Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
5582 
5583   return Result;
5584 }
5585 
5586 static SDValue PerformVSELECTCombine(SDNode *N,
5587                                      TargetLowering::DAGCombinerInfo &DCI) {
5588   SDValue VA = N->getOperand(1);
5589   EVT VectorVT = VA.getValueType();
5590   if (VectorVT != MVT::v4i8)
5591     return SDValue();
5592 
5593   // We need to split vselect into individual per-element operations Because we
5594   // use BFE/BFI instruction for byte extraction/insertion, we do end up with
5595   // 32-bit values, so we may as well do comparison as i32 to avoid conversions
5596   // to/from i16 normally used for i8 values.
5597   SmallVector<SDValue, 4> E;
5598   SDLoc DL(N);
5599   SDValue VCond = N->getOperand(0);
5600   SDValue VB = N->getOperand(2);
5601   for (int I = 0; I < 4; ++I) {
5602     SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
5603                                 DCI.DAG.getConstant(I, DL, MVT::i32));
5604     SDValue EA = DCI.DAG.getAnyExtOrTrunc(
5605         DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
5606                         DCI.DAG.getConstant(I, DL, MVT::i32)),
5607         DL, MVT::i32);
5608     SDValue EB = DCI.DAG.getAnyExtOrTrunc(
5609         DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
5610                         DCI.DAG.getConstant(I, DL, MVT::i32)),
5611         DL, MVT::i32);
5612     E.push_back(DCI.DAG.getAnyExtOrTrunc(
5613         DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
5614   }
5615   return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
5616 }
5617 
5618 static SDValue PerformLOADCombine(SDNode *N,
5619                                   TargetLowering::DAGCombinerInfo &DCI) {
5620   SelectionDAG &DAG = DCI.DAG;
5621   LoadSDNode *LD = cast<LoadSDNode>(N);
5622 
5623   // Lower a v16i8 load into a LoadV4 operation with i32 results instead of
5624   // letting ReplaceLoadVector split it into smaller loads during legalization.
5625   // This is done at dag-combine1 time, so that vector operations with i8
5626   // elements can be optimised away instead of being needlessly split during
5627   // legalization, which involves storing to the stack and loading it back.
5628   EVT VT = N->getValueType(0);
5629   if (VT != MVT::v16i8)
5630     return SDValue();
5631 
5632   SDLoc DL(N);
5633 
5634   // Create a v4i32 vector load operation, effectively <4 x v4i8>.
5635   unsigned Opc = NVPTXISD::LoadV4;
5636   EVT NewVT = MVT::v4i32;
5637   EVT EltVT = NewVT.getVectorElementType();
5638   unsigned NumElts = NewVT.getVectorNumElements();
5639   EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
5640   SDVTList RetVTList = DAG.getVTList(RetVTs);
5641   SmallVector<SDValue, 8> Ops(N->ops());
5642   Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
5643   SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT,
5644                                             LD->getMemOperand());
5645   SDValue NewChain = NewLoad.getValue(NumElts);
5646 
5647   // Create a vector of the same type returned by the original load.
5648   SmallVector<SDValue, 4> Elts;
5649   for (unsigned i = 0; i < NumElts; i++)
5650     Elts.push_back(NewLoad.getValue(i));
5651   return DCI.DAG.getMergeValues(
5652       {DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
5653        NewChain},
5654       DL);
5655 }
5656 
5657 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
5658                                                DAGCombinerInfo &DCI) const {
5659   CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
5660   switch (N->getOpcode()) {
5661     default: break;
5662     case ISD::ADD:
5663     case ISD::FADD:
5664       return PerformADDCombine(N, DCI, STI, OptLevel);
5665     case ISD::MUL:
5666       return PerformMULCombine(N, DCI, OptLevel);
5667     case ISD::SHL:
5668       return PerformSHLCombine(N, DCI, OptLevel);
5669     case ISD::AND:
5670       return PerformANDCombine(N, DCI);
5671     case ISD::UREM:
5672     case ISD::SREM:
5673       return PerformREMCombine(N, DCI, OptLevel);
5674     case ISD::SETCC:
5675       return PerformSETCCCombine(N, DCI, STI.getSmVersion());
5676     case ISD::LOAD:
5677       return PerformLOADCombine(N, DCI);
5678     case NVPTXISD::StoreRetval:
5679     case NVPTXISD::StoreRetvalV2:
5680     case NVPTXISD::StoreRetvalV4:
5681       return PerformStoreRetvalCombine(N);
5682     case ISD::EXTRACT_VECTOR_ELT:
5683       return PerformEXTRACTCombine(N, DCI);
5684     case ISD::VSELECT:
5685       return PerformVSELECTCombine(N, DCI);
5686   }
5687   return SDValue();
5688 }
5689 
5690 /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
5691 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
5692                               SmallVectorImpl<SDValue> &Results) {
5693   EVT ResVT = N->getValueType(0);
5694   SDLoc DL(N);
5695 
5696   assert(ResVT.isVector() && "Vector load must have vector type");
5697 
5698   // We only handle "native" vector sizes for now, e.g. <4 x double> is not
5699   // legal.  We can (and should) split that into 2 loads of <2 x double> here
5700   // but I'm leaving that as a TODO for now.
5701   assert(ResVT.isSimple() && "Can only handle simple types");
5702   switch (ResVT.getSimpleVT().SimpleTy) {
5703   default:
5704     return;
5705   case MVT::v2i8:
5706   case MVT::v2i16:
5707   case MVT::v2i32:
5708   case MVT::v2i64:
5709   case MVT::v2f16:
5710   case MVT::v2f32:
5711   case MVT::v2f64:
5712   case MVT::v4i8:
5713   case MVT::v4i16:
5714   case MVT::v4i32:
5715   case MVT::v4f16:
5716   case MVT::v4f32:
5717   case MVT::v8f16:  // <4 x f16x2>
5718   case MVT::v8bf16: // <4 x bf16x2>
5719   case MVT::v8i16:  // <4 x i16x2>
5720     // This is a "native" vector type
5721     break;
5722   }
5723 
5724   LoadSDNode *LD = cast<LoadSDNode>(N);
5725 
5726   Align Alignment = LD->getAlign();
5727   auto &TD = DAG.getDataLayout();
5728   Align PrefAlign =
5729       TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
5730   if (Alignment < PrefAlign) {
5731     // This load is not sufficiently aligned, so bail out and let this vector
5732     // load be scalarized.  Note that we may still be able to emit smaller
5733     // vector loads.  For example, if we are loading a <4 x float> with an
5734     // alignment of 8, this check will fail but the legalizer will try again
5735     // with 2 x <2 x float>, which will succeed with an alignment of 8.
5736     return;
5737   }
5738 
5739   EVT EltVT = ResVT.getVectorElementType();
5740   unsigned NumElts = ResVT.getVectorNumElements();
5741 
5742   // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
5743   // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
5744   // loaded type to i16 and propagate the "real" type as the memory type.
5745   bool NeedTrunc = false;
5746   if (EltVT.getSizeInBits() < 16) {
5747     EltVT = MVT::i16;
5748     NeedTrunc = true;
5749   }
5750 
5751   unsigned Opcode = 0;
5752   SDVTList LdResVTs;
5753   bool Load16x2 = false;
5754 
5755   switch (NumElts) {
5756   default:
5757     return;
5758   case 2:
5759     Opcode = NVPTXISD::LoadV2;
5760     LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
5761     break;
5762   case 4: {
5763     Opcode = NVPTXISD::LoadV4;
5764     EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
5765     LdResVTs = DAG.getVTList(ListVTs);
5766     break;
5767   }
5768   case 8: {
5769     // v8f16 is a special case. PTX doesn't have ld.v8.f16
5770     // instruction. Instead, we split the vector into v2f16 chunks and
5771     // load them with ld.v4.b32.
5772     assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
5773     Load16x2 = true;
5774     Opcode = NVPTXISD::LoadV4;
5775     EVT VVT;
5776     switch (EltVT.getSimpleVT().SimpleTy) {
5777     case MVT::f16:
5778       VVT = MVT::v2f16;
5779       break;
5780     case MVT::bf16:
5781       VVT = MVT::v2bf16;
5782       break;
5783     case MVT::i16:
5784       VVT = MVT::v2i16;
5785       break;
5786     default:
5787       llvm_unreachable("Unsupported v8 vector type.");
5788     }
5789     EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
5790     LdResVTs = DAG.getVTList(ListVTs);
5791     break;
5792   }
5793   }
5794 
5795   // Copy regular operands
5796   SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
5797 
5798   // The select routine does not have access to the LoadSDNode instance, so
5799   // pass along the extension information
5800   OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
5801 
5802   SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
5803                                           LD->getMemoryVT(),
5804                                           LD->getMemOperand());
5805 
5806   SmallVector<SDValue, 8> ScalarRes;
5807   if (Load16x2) {
5808     // Split v2f16 subvectors back into individual elements.
5809     NumElts /= 2;
5810     for (unsigned i = 0; i < NumElts; ++i) {
5811       SDValue SubVector = NewLD.getValue(i);
5812       SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
5813                                DAG.getIntPtrConstant(0, DL));
5814       SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
5815                                DAG.getIntPtrConstant(1, DL));
5816       ScalarRes.push_back(E0);
5817       ScalarRes.push_back(E1);
5818     }
5819   } else {
5820     for (unsigned i = 0; i < NumElts; ++i) {
5821       SDValue Res = NewLD.getValue(i);
5822       if (NeedTrunc)
5823         Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
5824       ScalarRes.push_back(Res);
5825     }
5826   }
5827 
5828   SDValue LoadChain = NewLD.getValue(NumElts);
5829 
5830   SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
5831 
5832   Results.push_back(BuildVec);
5833   Results.push_back(LoadChain);
5834 }
5835 
5836 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
5837                                      SmallVectorImpl<SDValue> &Results) {
5838   SDValue Chain = N->getOperand(0);
5839   SDValue Intrin = N->getOperand(1);
5840   SDLoc DL(N);
5841 
5842   // Get the intrinsic ID
5843   unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
5844   switch (IntrinNo) {
5845   default:
5846     return;
5847   case Intrinsic::nvvm_ldg_global_i:
5848   case Intrinsic::nvvm_ldg_global_f:
5849   case Intrinsic::nvvm_ldg_global_p:
5850   case Intrinsic::nvvm_ldu_global_i:
5851   case Intrinsic::nvvm_ldu_global_f:
5852   case Intrinsic::nvvm_ldu_global_p: {
5853     EVT ResVT = N->getValueType(0);
5854 
5855     if (ResVT.isVector()) {
5856       // Vector LDG/LDU
5857 
5858       unsigned NumElts = ResVT.getVectorNumElements();
5859       EVT EltVT = ResVT.getVectorElementType();
5860 
5861       // Since LDU/LDG are target nodes, we cannot rely on DAG type
5862       // legalization.
5863       // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
5864       // loaded type to i16 and propagate the "real" type as the memory type.
5865       bool NeedTrunc = false;
5866       if (EltVT.getSizeInBits() < 16) {
5867         EltVT = MVT::i16;
5868         NeedTrunc = true;
5869       }
5870 
5871       unsigned Opcode = 0;
5872       SDVTList LdResVTs;
5873 
5874       switch (NumElts) {
5875       default:
5876         return;
5877       case 2:
5878         switch (IntrinNo) {
5879         default:
5880           return;
5881         case Intrinsic::nvvm_ldg_global_i:
5882         case Intrinsic::nvvm_ldg_global_f:
5883         case Intrinsic::nvvm_ldg_global_p:
5884           Opcode = NVPTXISD::LDGV2;
5885           break;
5886         case Intrinsic::nvvm_ldu_global_i:
5887         case Intrinsic::nvvm_ldu_global_f:
5888         case Intrinsic::nvvm_ldu_global_p:
5889           Opcode = NVPTXISD::LDUV2;
5890           break;
5891         }
5892         LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
5893         break;
5894       case 4: {
5895         switch (IntrinNo) {
5896         default:
5897           return;
5898         case Intrinsic::nvvm_ldg_global_i:
5899         case Intrinsic::nvvm_ldg_global_f:
5900         case Intrinsic::nvvm_ldg_global_p:
5901           Opcode = NVPTXISD::LDGV4;
5902           break;
5903         case Intrinsic::nvvm_ldu_global_i:
5904         case Intrinsic::nvvm_ldu_global_f:
5905         case Intrinsic::nvvm_ldu_global_p:
5906           Opcode = NVPTXISD::LDUV4;
5907           break;
5908         }
5909         EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
5910         LdResVTs = DAG.getVTList(ListVTs);
5911         break;
5912       }
5913       }
5914 
5915       SmallVector<SDValue, 8> OtherOps;
5916 
5917       // Copy regular operands
5918 
5919       OtherOps.push_back(Chain); // Chain
5920                                  // Skip operand 1 (intrinsic ID)
5921       // Others
5922       OtherOps.append(N->op_begin() + 2, N->op_end());
5923 
5924       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
5925 
5926       SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
5927                                               MemSD->getMemoryVT(),
5928                                               MemSD->getMemOperand());
5929 
5930       SmallVector<SDValue, 4> ScalarRes;
5931 
5932       for (unsigned i = 0; i < NumElts; ++i) {
5933         SDValue Res = NewLD.getValue(i);
5934         if (NeedTrunc)
5935           Res =
5936               DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
5937         ScalarRes.push_back(Res);
5938       }
5939 
5940       SDValue LoadChain = NewLD.getValue(NumElts);
5941 
5942       SDValue BuildVec =
5943           DAG.getBuildVector(ResVT, DL, ScalarRes);
5944 
5945       Results.push_back(BuildVec);
5946       Results.push_back(LoadChain);
5947     } else {
5948       // i8 LDG/LDU
5949       assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
5950              "Custom handling of non-i8 ldu/ldg?");
5951 
5952       // Just copy all operands as-is
5953       SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
5954 
5955       // Force output to i16
5956       SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
5957 
5958       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
5959 
5960       // We make sure the memory type is i8, which will be used during isel
5961       // to select the proper instruction.
5962       SDValue NewLD =
5963           DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
5964                                   MVT::i8, MemSD->getMemOperand());
5965 
5966       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
5967                                     NewLD.getValue(0)));
5968       Results.push_back(NewLD.getValue(1));
5969     }
5970   }
5971   }
5972 }
5973 
5974 void NVPTXTargetLowering::ReplaceNodeResults(
5975     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
5976   switch (N->getOpcode()) {
5977   default:
5978     report_fatal_error("Unhandled custom legalization");
5979   case ISD::LOAD:
5980     ReplaceLoadVector(N, DAG, Results);
5981     return;
5982   case ISD::INTRINSIC_W_CHAIN:
5983     ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
5984     return;
5985   }
5986 }
5987 
5988 NVPTXTargetLowering::AtomicExpansionKind
5989 NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
5990   Type *Ty = AI->getValOperand()->getType();
5991 
5992   if (AI->isFloatingPointOperation()) {
5993     if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
5994       if (Ty->isFloatTy())
5995         return AtomicExpansionKind::None;
5996       if (Ty->isDoubleTy() && STI.hasAtomAddF64())
5997         return AtomicExpansionKind::None;
5998     }
5999     return AtomicExpansionKind::CmpXChg;
6000   }
6001 
6002   assert(Ty->isIntegerTy() && "Ty should be integer at this point");
6003   auto ITy = cast<llvm::IntegerType>(Ty);
6004 
6005   switch (AI->getOperation()) {
6006   default:
6007     return AtomicExpansionKind::CmpXChg;
6008   case AtomicRMWInst::BinOp::And:
6009   case AtomicRMWInst::BinOp::Or:
6010   case AtomicRMWInst::BinOp::Xor:
6011   case AtomicRMWInst::BinOp::Xchg:
6012     switch (ITy->getBitWidth()) {
6013     case 8:
6014     case 16:
6015       return AtomicExpansionKind::CmpXChg;
6016     case 32:
6017       return AtomicExpansionKind::None;
6018     case 64:
6019       if (STI.hasAtomBitwise64())
6020         return AtomicExpansionKind::None;
6021       return AtomicExpansionKind::CmpXChg;
6022     default:
6023       llvm_unreachable("unsupported width encountered");
6024     }
6025   case AtomicRMWInst::BinOp::Add:
6026   case AtomicRMWInst::BinOp::Sub:
6027   case AtomicRMWInst::BinOp::Max:
6028   case AtomicRMWInst::BinOp::Min:
6029   case AtomicRMWInst::BinOp::UMax:
6030   case AtomicRMWInst::BinOp::UMin:
6031     switch (ITy->getBitWidth()) {
6032     case 8:
6033     case 16:
6034       return AtomicExpansionKind::CmpXChg;
6035     case 32:
6036       return AtomicExpansionKind::None;
6037     case 64:
6038       if (STI.hasAtomMinMax64())
6039         return AtomicExpansionKind::None;
6040       return AtomicExpansionKind::CmpXChg;
6041     default:
6042       llvm_unreachable("unsupported width encountered");
6043     }
6044   }
6045 
6046   return AtomicExpansionKind::CmpXChg;
6047 }
6048 
6049 // Pin NVPTXTargetObjectFile's vtables to this file.
6050 NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;
6051 
6052 MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
6053     const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
6054   return getDataSection();
6055 }
6056