xref: /freebsd/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (revision 7fdf597e96a02165cfe22ff357b857d5fa15ed8a)
1 //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the interfaces that NVPTX uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "NVPTXISelLowering.h"
15 #include "MCTargetDesc/NVPTXBaseInfo.h"
16 #include "NVPTX.h"
17 #include "NVPTXSubtarget.h"
18 #include "NVPTXTargetMachine.h"
19 #include "NVPTXTargetObjectFile.h"
20 #include "NVPTXUtilities.h"
21 #include "llvm/ADT/APInt.h"
22 #include "llvm/ADT/STLExtras.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/ADT/StringRef.h"
25 #include "llvm/CodeGen/Analysis.h"
26 #include "llvm/CodeGen/ISDOpcodes.h"
27 #include "llvm/CodeGen/MachineFunction.h"
28 #include "llvm/CodeGen/MachineMemOperand.h"
29 #include "llvm/CodeGen/SelectionDAG.h"
30 #include "llvm/CodeGen/SelectionDAGNodes.h"
31 #include "llvm/CodeGen/TargetCallingConv.h"
32 #include "llvm/CodeGen/TargetLowering.h"
33 #include "llvm/CodeGen/ValueTypes.h"
34 #include "llvm/CodeGenTypes/MachineValueType.h"
35 #include "llvm/IR/Argument.h"
36 #include "llvm/IR/Attributes.h"
37 #include "llvm/IR/Constants.h"
38 #include "llvm/IR/DataLayout.h"
39 #include "llvm/IR/DerivedTypes.h"
40 #include "llvm/IR/DiagnosticInfo.h"
41 #include "llvm/IR/FPEnv.h"
42 #include "llvm/IR/Function.h"
43 #include "llvm/IR/GlobalValue.h"
44 #include "llvm/IR/Instruction.h"
45 #include "llvm/IR/Instructions.h"
46 #include "llvm/IR/IntrinsicsNVPTX.h"
47 #include "llvm/IR/Module.h"
48 #include "llvm/IR/Type.h"
49 #include "llvm/IR/Value.h"
50 #include "llvm/Support/Alignment.h"
51 #include "llvm/Support/Casting.h"
52 #include "llvm/Support/CodeGen.h"
53 #include "llvm/Support/CommandLine.h"
54 #include "llvm/Support/ErrorHandling.h"
55 #include "llvm/Support/raw_ostream.h"
56 #include "llvm/Target/TargetMachine.h"
57 #include "llvm/Target/TargetOptions.h"
58 #include <algorithm>
59 #include <cassert>
60 #include <cmath>
61 #include <cstdint>
62 #include <iterator>
63 #include <optional>
64 #include <sstream>
65 #include <string>
66 #include <utility>
67 #include <vector>
68 
69 #define DEBUG_TYPE "nvptx-lower"
70 
71 using namespace llvm;
72 
73 static std::atomic<unsigned> GlobalUniqueCallSite;
74 
75 static cl::opt<bool> sched4reg(
76     "nvptx-sched4reg",
77     cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));
78 
79 static cl::opt<unsigned> FMAContractLevelOpt(
80     "nvptx-fma-level", cl::Hidden,
81     cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
82              " 1: do it, 2: do it aggressively)"),
83     cl::init(2));
84 
85 static cl::opt<int> UsePrecDivF32(
86     "nvptx-prec-divf32", cl::Hidden,
87     cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
88              " IEEE Compliant F32 div.rnd if available."),
89     cl::init(2));
90 
91 static cl::opt<bool> UsePrecSqrtF32(
92     "nvptx-prec-sqrtf32", cl::Hidden,
93     cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
94     cl::init(true));
95 
96 static cl::opt<bool> ForceMinByValParamAlign(
97     "nvptx-force-min-byval-param-align", cl::Hidden,
98     cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
99              " params of device functions."),
100     cl::init(false));
101 
102 int NVPTXTargetLowering::getDivF32Level() const {
103   if (UsePrecDivF32.getNumOccurrences() > 0) {
104   // If nvptx-prec-divf32=N is used on the command-line, always honor it.
105     return UsePrecDivF32;
106   } else {
107     // Otherwise, use div.approx if fast math is enabled
108     if (getTargetMachine().Options.UnsafeFPMath)
109       return 0;
110     else
111       return 2;
112   }
113 }
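
// Summary of the policy above (illustrative, not an additional code path):
// an explicit -nvptx-prec-divf32=N on the command line always wins; without
// it, UnsafeFPMath selects div.approx (level 0) and the default remains the
// IEEE-compliant level 2 behavior.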
114 
115 bool NVPTXTargetLowering::usePrecSqrtF32() const {
116   if (UsePrecSqrtF32.getNumOccurrences() > 0) {
117     // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
118     return UsePrecSqrtF32;
119   } else {
120     // Otherwise, use sqrt.approx if fast math is enabled
121     return !getTargetMachine().Options.UnsafeFPMath;
122   }
123 }
124 
125 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
126   return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
127          DenormalMode::PreserveSign;
128 }
129 
130 static bool IsPTXVectorType(MVT VT) {
131   switch (VT.SimpleTy) {
132   default:
133     return false;
134   case MVT::v2i1:
135   case MVT::v4i1:
136   case MVT::v2i8:
137   case MVT::v4i8:
138   case MVT::v2i16:
139   case MVT::v4i16:
140   case MVT::v8i16: // <4 x i16x2>
141   case MVT::v2i32:
142   case MVT::v4i32:
143   case MVT::v2i64:
144   case MVT::v2f16:
145   case MVT::v4f16:
146   case MVT::v8f16: // <4 x f16x2>
147   case MVT::v2bf16:
148   case MVT::v4bf16:
149   case MVT::v8bf16: // <4 x bf16x2>
150   case MVT::v2f32:
151   case MVT::v4f32:
152   case MVT::v2f64:
153     return true;
154   }
155 }
156 
157 static bool Is16bitsType(MVT VT) {
158   return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 ||
159           VT.SimpleTy == MVT::i16);
160 }
161 
162 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
163 /// EVTs that compose it.  Unlike ComputeValueVTs, this will break apart vectors
164 /// into their primitive components.
165 /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
166 /// same number of types as the Ins/Outs arrays in LowerFormalArguments,
167 /// LowerCall, and LowerReturn.
168 static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
169                                Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
170                                SmallVectorImpl<uint64_t> *Offsets = nullptr,
171                                uint64_t StartingOffset = 0) {
172   SmallVector<EVT, 16> TempVTs;
173   SmallVector<uint64_t, 16> TempOffsets;
174 
175   // Special case for i128 - decompose to (i64, i64)
176   if (Ty->isIntegerTy(128)) {
177     ValueVTs.push_back(EVT(MVT::i64));
178     ValueVTs.push_back(EVT(MVT::i64));
179 
180     if (Offsets) {
181       Offsets->push_back(StartingOffset + 0);
182       Offsets->push_back(StartingOffset + 8);
183     }
184 
185     return;
186   }
187 
188   // For structs, recursively flatten each element with ComputePTXValueVTs.
189   if (StructType *STy = dyn_cast<StructType>(Ty)) {
190     auto const *SL = DL.getStructLayout(STy);
191     auto ElementNum = 0;
192     for (auto *EI : STy->elements()) {
193       ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
194                          StartingOffset + SL->getElementOffset(ElementNum));
195       ++ElementNum;
196     }
197     return;
198   }
199 
200   ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
201   for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
202     EVT VT = TempVTs[i];
203     uint64_t Off = TempOffsets[i];
204     // Split vectors into individual elements, except for v2f16, which
205     // we will pass as a single scalar.
206     if (VT.isVector()) {
207       unsigned NumElts = VT.getVectorNumElements();
208       EVT EltVT = VT.getVectorElementType();
209       // Vectors with an even number of 16-bit elements (f16/bf16/i16)
210       // will be passed to us as an array of v2f16/v2bf16/v2i16 elements.
211       // We must match this so we stay in sync with Ins/Outs.
212       if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) {
213         switch (EltVT.getSimpleVT().SimpleTy) {
214         case MVT::f16:
215           EltVT = MVT::v2f16;
216           break;
217         case MVT::bf16:
218           EltVT = MVT::v2bf16;
219           break;
220         case MVT::i16:
221           EltVT = MVT::v2i16;
222           break;
223         default:
224           llvm_unreachable("Unexpected type");
225         }
226         NumElts /= 2;
227       } else if (EltVT.getSimpleVT() == MVT::i8 &&
228                  (NumElts % 4 == 0 || NumElts == 3)) {
229         // v*i8 are formally lowered as v4i8
230         EltVT = MVT::v4i8;
231         NumElts = (NumElts + 3) / 4;
232       } else if (EltVT.getSimpleVT() == MVT::i8 && NumElts == 2) {
233         // v2i8 is promoted to v2i16
234         NumElts = 1;
235         EltVT = MVT::v2i16;
236       }
237       for (unsigned j = 0; j != NumElts; ++j) {
238         ValueVTs.push_back(EltVT);
239         if (Offsets)
240           Offsets->push_back(Off + j * EltVT.getStoreSize());
241       }
242     } else {
243       ValueVTs.push_back(VT);
244       if (Offsets)
245         Offsets->push_back(Off);
246     }
247   }
248 }
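
// Illustrative examples of the flattening above (assumed values, derived from
// the rules in this function rather than from any particular test): a
// <4 x half> argument becomes two v2f16 pieces at offsets 0 and 4, and an
// i128 argument becomes two i64 pieces at offsets 0 and 8, matching the
// Ins/Outs layout used by LowerFormalArguments/LowerCall/LowerReturn.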
249 
250 /// PromoteScalarIntegerPTX
251 /// Used to make sure the arguments/returns are suitable for passing
252 /// and promote them to a larger size if they're not.
253 ///
254 /// The promoted type is placed in \p PromotedVT if the function returns true.
255 static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
256   if (VT.isScalarInteger()) {
257     switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
258     default:
259       llvm_unreachable(
260           "Promotion is not suitable for scalars of size larger than 64-bits");
261     case 1:
262       *PromotedVT = MVT::i1;
263       break;
264     case 2:
265     case 4:
266     case 8:
267       *PromotedVT = MVT::i8;
268       break;
269     case 16:
270       *PromotedVT = MVT::i16;
271       break;
272     case 32:
273       *PromotedVT = MVT::i32;
274       break;
275     case 64:
276       *PromotedVT = MVT::i64;
277       break;
278     }
279     return EVT(*PromotedVT) != VT;
280   }
281   return false;
282 }
283 
284 // Check whether we can merge loads/stores of some of the pieces of a
285 // flattened function parameter or return value into a single vector
286 // load/store.
287 //
288 // The flattened parameter is represented as a list of EVTs and
289 // offsets, and the whole structure is aligned to ParamAlignment. This
290 // function determines whether we can load/store pieces of the
291 // parameter starting at index Idx using a single vectorized op of
292 // size AccessSize. If so, it returns the number of param pieces
293 // covered by the vector op. Otherwise, it returns 1.
294 static unsigned CanMergeParamLoadStoresStartingAt(
295     unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
296     const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
297 
298   // Can't vectorize if param alignment is not sufficient.
299   if (ParamAlignment < AccessSize)
300     return 1;
301   // Can't vectorize if offset is not aligned.
302   if (Offsets[Idx] & (AccessSize - 1))
303     return 1;
304 
305   EVT EltVT = ValueVTs[Idx];
306   unsigned EltSize = EltVT.getStoreSize();
307 
308   // Element is too large to vectorize.
309   if (EltSize >= AccessSize)
310     return 1;
311 
312   unsigned NumElts = AccessSize / EltSize;
313   // Can't vectorize if AccessSize is not a multiple of EltSize.
314   if (AccessSize != EltSize * NumElts)
315     return 1;
316 
317   // We don't have enough elements to vectorize.
318   if (Idx + NumElts > ValueVTs.size())
319     return 1;
320 
321   // PTX ISA can only deal with 2- and 4-element vector ops.
322   if (NumElts != 4 && NumElts != 2)
323     return 1;
324 
325   for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
326     // Types do not match.
327     if (ValueVTs[j] != EltVT)
328       return 1;
329 
330     // Elements are not contiguous.
331     if (Offsets[j] - Offsets[j - 1] != EltSize)
332       return 1;
333   }
334   // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
335   return NumElts;
336 }
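
// Worked example (illustrative): with ValueVTs = {f32, f32, f32, f32},
// Offsets = {0, 4, 8, 12}, ParamAlignment = 16 and AccessSize = 16, every
// check above passes and the function returns 4, i.e. one 16-byte vector
// access covers all four pieces. If the last offset were 16 instead of 12,
// the contiguity check would fail and the function would return 1.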
337 
338 // Flags for tracking per-element vectorization state of loads/stores
339 // of a flattened function parameter or return value.
340 enum ParamVectorizationFlags {
341   PVF_INNER = 0x0, // Middle elements of a vector.
342   PVF_FIRST = 0x1, // First element of the vector.
343   PVF_LAST = 0x2,  // Last element of the vector.
344   // Scalar is effectively a 1-element vector.
345   PVF_SCALAR = PVF_FIRST | PVF_LAST
346 };
347 
348 // Computes whether and how we can vectorize the loads/stores of a
349 // flattened function parameter or return value.
350 //
351 // The flattened parameter is represented as the list of ValueVTs and
352 // Offsets, and is aligned to ParamAlignment bytes. We return a vector
353 // of the same size as ValueVTs indicating how each piece should be
354 // loaded/stored (i.e. as a scalar, or as part of a vector
355 // load/store).
356 static SmallVector<ParamVectorizationFlags, 16>
357 VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
358                      const SmallVectorImpl<uint64_t> &Offsets,
359                      Align ParamAlignment, bool IsVAArg = false) {
360   // Set vector size to match ValueVTs and mark all elements as
361   // scalars by default.
362   SmallVector<ParamVectorizationFlags, 16> VectorInfo;
363   VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
364 
365   if (IsVAArg)
366     return VectorInfo;
367 
368   // Check what we can vectorize using 128/64/32-bit accesses.
369   for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
370     // Skip elements we've already processed.
371     assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
372     for (unsigned AccessSize : {16, 8, 4, 2}) {
373       unsigned NumElts = CanMergeParamLoadStoresStartingAt(
374           I, AccessSize, ValueVTs, Offsets, ParamAlignment);
375       // Mark vectorized elements.
376       switch (NumElts) {
377       default:
378         llvm_unreachable("Unexpected return value");
379       case 1:
380         // Can't vectorize using this size, try next smaller size.
381         continue;
382       case 2:
383         assert(I + 1 < E && "Not enough elements.");
384         VectorInfo[I] = PVF_FIRST;
385         VectorInfo[I + 1] = PVF_LAST;
386         I += 1;
387         break;
388       case 4:
389         assert(I + 3 < E && "Not enough elements.");
390         VectorInfo[I] = PVF_FIRST;
391         VectorInfo[I + 1] = PVF_INNER;
392         VectorInfo[I + 2] = PVF_INNER;
393         VectorInfo[I + 3] = PVF_LAST;
394         I += 3;
395         break;
396       }
397       // Break out of the inner loop because we've already succeeded
398       // using the largest possible AccessSize.
399       break;
400     }
401   }
402   return VectorInfo;
403 }
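
// Continuing the example above (illustrative): the same four f32 pieces at
// offsets {0, 4, 8, 12} with 16-byte alignment are marked
// {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST}, describing a single 4-element
// vector access, while a trailing lone f32 piece would keep its default
// PVF_SCALAR marking.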
404 
405 // NVPTXTargetLowering Constructor.
406 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
407                                          const NVPTXSubtarget &STI)
408     : TargetLowering(TM), nvTM(&TM), STI(STI) {
409   // Always lower memset, memcpy, and memmove intrinsics to load/store
410   // instructions, rather than generating calls to memset, memcpy, or
411   // memmove.
412   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF;
413   MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned) 0xFFFFFFFF;
414   MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned) 0xFFFFFFFF;
415 
416   setBooleanContents(ZeroOrNegativeOneBooleanContent);
417   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
418 
419   // Jump is Expensive. Don't create extra control flow for 'and', 'or'
420   // condition branches.
421   setJumpIsExpensive(true);
422 
423   // Wide divides are _very_ slow. Try to reduce the width of the divide if
424   // possible.
425   addBypassSlowDiv(64, 32);
426 
427   // By default, use Source scheduling.
428   if (sched4reg)
429     setSchedulingPreference(Sched::RegPressure);
430   else
431     setSchedulingPreference(Sched::Source);
432 
433   auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
434                                     LegalizeAction NoF16Action) {
435     setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
436   };
437 
438   auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
439                                     LegalizeAction NoBF16Action) {
440     bool IsOpSupported = STI.hasBF16Math();
441     // A few of these instructions are only available on sm_90.
442     switch (Op) {
443       case ISD::FADD:
444       case ISD::FMUL:
445       case ISD::FSUB:
446       case ISD::SELECT:
447       case ISD::SELECT_CC:
448       case ISD::SETCC:
449       case ISD::FEXP2:
450       case ISD::FCEIL:
451       case ISD::FFLOOR:
452       case ISD::FNEARBYINT:
453       case ISD::FRINT:
454       case ISD::FROUNDEVEN:
455       case ISD::FTRUNC:
456         IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;
457         break;
458     }
459     setOperationAction(
460         Op, VT, IsOpSupported ? Action : NoBF16Action);
461   };
462 
463   auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
464                                      LegalizeAction NoI16x2Action) {
465     bool IsOpSupported = false;
466     // These instructions are only available on sm_90.
467     switch (Op) {
468     case ISD::ADD:
469     case ISD::SMAX:
470     case ISD::SMIN:
471     case ISD::UMIN:
472     case ISD::UMAX:
473       IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
474       break;
475     }
476     setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
477   };
478 
479   addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
480   addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
481   addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
482   addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
483   addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
484   addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
485   addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
486   addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
487   addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
488   addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
489   addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
490   addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);
491 
492   // Conversion to/from FP16/FP16x2 is always legal.
493   setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
494   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
495   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
496   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);
497 
498   setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
499   if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
500     setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
501 
502   setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
503   setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
504 
505   // Conversion to/from BF16/BF16x2 is always legal.
506   setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Custom);
507   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2bf16, Custom);
508   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2bf16, Expand);
509   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2bf16, Expand);
510 
511   setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
512   setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
513   if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
514     AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
515 
516   // Conversion to/from i16/i16x2 is always legal.
517   setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
518   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
519   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand);
520   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand);
521 
522   setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);
523   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
524   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
525   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
526   // Only logical ops can be done on v4i8 directly, others must be done
527   // elementwise.
528   setOperationAction(
529       {ISD::ABS,         ISD::ADD,        ISD::ADDC,        ISD::ADDE,
530        ISD::BITREVERSE,  ISD::CTLZ,       ISD::CTPOP,       ISD::CTTZ,
531        ISD::FP_TO_SINT,  ISD::FP_TO_UINT, ISD::FSHL,        ISD::FSHR,
532        ISD::MUL,         ISD::MULHS,      ISD::MULHU,       ISD::PARITY,
533        ISD::ROTL,        ISD::ROTR,       ISD::SADDO,       ISD::SADDO_CARRY,
534        ISD::SADDSAT,     ISD::SDIV,       ISD::SDIVREM,     ISD::SELECT_CC,
535        ISD::SETCC,       ISD::SHL,        ISD::SINT_TO_FP,  ISD::SMAX,
536        ISD::SMIN,        ISD::SMULO,      ISD::SMUL_LOHI,   ISD::SRA,
537        ISD::SREM,        ISD::SRL,        ISD::SSHLSAT,     ISD::SSUBO,
538        ISD::SSUBO_CARRY, ISD::SSUBSAT,    ISD::SUB,         ISD::SUBC,
539        ISD::SUBE,        ISD::UADDO,      ISD::UADDO_CARRY, ISD::UADDSAT,
540        ISD::UDIV,        ISD::UDIVREM,    ISD::UINT_TO_FP,  ISD::UMAX,
541        ISD::UMIN,        ISD::UMULO,      ISD::UMUL_LOHI,   ISD::UREM,
542        ISD::USHLSAT,     ISD::USUBO,      ISD::USUBO_CARRY, ISD::VSELECT,
543        ISD::USUBSAT},
544       MVT::v4i8, Expand);
545 
546   // Operations not directly supported by NVPTX.
547   for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
548                  MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
549                  MVT::i32, MVT::i64}) {
550     setOperationAction(ISD::SELECT_CC, VT, Expand);
551     setOperationAction(ISD::BR_CC, VT, Expand);
552   }
553 
554   // Some SIGN_EXTEND_INREG can be done using the cvt instruction.
555   // For others we will expand to a SHL/SRA pair.
556   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
557   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
558   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
559   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
560   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
561   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
562 
563   setOperationAction(ISD::SHL_PARTS, MVT::i32  , Custom);
564   setOperationAction(ISD::SRA_PARTS, MVT::i32  , Custom);
565   setOperationAction(ISD::SRL_PARTS, MVT::i32  , Custom);
566   setOperationAction(ISD::SHL_PARTS, MVT::i64  , Custom);
567   setOperationAction(ISD::SRA_PARTS, MVT::i64  , Custom);
568   setOperationAction(ISD::SRL_PARTS, MVT::i64  , Custom);
569 
570   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
571   setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
572 
573   // TODO: we may consider expanding ROTL/ROTR on older GPUs.  Currently on GPUs
574   // that don't have h/w rotation we lower them to multi-instruction assembly.
575   // See ROT*_sw in NVPTXIntrInfo.td
576   setOperationAction(ISD::ROTL, MVT::i64, Legal);
577   setOperationAction(ISD::ROTR, MVT::i64, Legal);
578   setOperationAction(ISD::ROTL, MVT::i32, Legal);
579   setOperationAction(ISD::ROTR, MVT::i32, Legal);
580 
581   setOperationAction(ISD::ROTL, MVT::i16, Expand);
582   setOperationAction(ISD::ROTL, MVT::v2i16, Expand);
583   setOperationAction(ISD::ROTR, MVT::i16, Expand);
584   setOperationAction(ISD::ROTR, MVT::v2i16, Expand);
585   setOperationAction(ISD::ROTL, MVT::i8, Expand);
586   setOperationAction(ISD::ROTR, MVT::i8, Expand);
587   setOperationAction(ISD::BSWAP, MVT::i16, Expand);
588 
589   // Indirect branch is not supported.
590   // This also disables Jump Table creation.
591   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
592   setOperationAction(ISD::BRIND, MVT::Other, Expand);
593 
594   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
595   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
596 
597   // We want to legalize constant-related memmove and memcpy
598   // intrinsics.
599   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
600 
601   // Turn FP extload into load/fpextend
602   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
603   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
604   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
605   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
606   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
607   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
608   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
609   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
610   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
611   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
612   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
613   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
614   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
615   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
616   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
617   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
618   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
619   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
620   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
621   // Turn FP truncstore into trunc + store.
622   // FIXME: vector types should also be expanded
623   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
624   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
625   setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
626   setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
627   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
628 
629   // PTX does not support load / store predicate registers
630   setOperationAction(ISD::LOAD, MVT::i1, Custom);
631   setOperationAction(ISD::STORE, MVT::i1, Custom);
632 
633   for (MVT VT : MVT::integer_valuetypes()) {
634     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
635     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
636     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
637     setTruncStoreAction(VT, MVT::i1, Expand);
638   }
639 
640   // Expand extloads of integer vectors.
641   setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
642                    MVT::v2i8, Expand);
643   setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
644 
645   // This is legal in NVPTX
646   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
647   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
648   setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
649   setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
650 
651   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
652   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
653 
654   // TRAP can be lowered to PTX trap
655   setOperationAction(ISD::TRAP, MVT::Other, Legal);
656 
657   // Register custom handling for vector loads/stores
658   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
659     if (IsPTXVectorType(VT)) {
660       setOperationAction(ISD::LOAD, VT, Custom);
661       setOperationAction(ISD::STORE, VT, Custom);
662       setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
663     }
664   }
665 
666   // Support varargs.
667   setOperationAction(ISD::VASTART, MVT::Other, Custom);
668   setOperationAction(ISD::VAARG, MVT::Other, Custom);
669   setOperationAction(ISD::VACOPY, MVT::Other, Expand);
670   setOperationAction(ISD::VAEND, MVT::Other, Expand);
671 
672   // Custom handling for i8 intrinsics
673   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
674 
675   for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
676     setOperationAction(ISD::ABS,  Ty, Legal);
677     setOperationAction(ISD::SMIN, Ty, Legal);
678     setOperationAction(ISD::SMAX, Ty, Legal);
679     setOperationAction(ISD::UMIN, Ty, Legal);
680     setOperationAction(ISD::UMAX, Ty, Legal);
681 
682     setOperationAction(ISD::CTPOP, Ty, Legal);
683     setOperationAction(ISD::CTLZ, Ty, Legal);
684   }
685 
686   setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
687   setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
688   setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
689   setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
690   setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
691   setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
692   setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
693 
694   setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
695   setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
696   setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
697   setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
698   setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
699   setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
700 
701   // Other arithmetic and logic ops are unsupported.
702   setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,
703                       ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
704                       ISD::SINT_TO_FP, ISD::UINT_TO_FP},
705                      MVT::v2i16, Expand);
706 
707   setOperationAction(ISD::ADDC, MVT::i32, Legal);
708   setOperationAction(ISD::ADDE, MVT::i32, Legal);
709   setOperationAction(ISD::SUBC, MVT::i32, Legal);
710   setOperationAction(ISD::SUBE, MVT::i32, Legal);
711   if (STI.getPTXVersion() >= 43) {
712     setOperationAction(ISD::ADDC, MVT::i64, Legal);
713     setOperationAction(ISD::ADDE, MVT::i64, Legal);
714     setOperationAction(ISD::SUBC, MVT::i64, Legal);
715     setOperationAction(ISD::SUBE, MVT::i64, Legal);
716   }
717 
718   setOperationAction(ISD::CTTZ, MVT::i16, Expand);
719   setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
720   setOperationAction(ISD::CTTZ, MVT::i32, Expand);
721   setOperationAction(ISD::CTTZ, MVT::i64, Expand);
722 
723   // PTX does not directly support SELP of i1, so promote to i32 first
724   setOperationAction(ISD::SELECT, MVT::i1, Custom);
725 
726   // PTX cannot multiply two i64s in a single instruction.
727   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
728   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
729 
730   // We have some custom DAG combine patterns for these nodes
731   setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
732                        ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
733                        ISD::VSELECT});
734 
735   // setcc for f16x2 and bf16x2 needs special handling to prevent
736   // the legalizer's attempt to scalarize it, since v2i1 is not legal.
737   if (STI.allowFP16Math() || STI.hasBF16Math())
738     setTargetDAGCombine(ISD::SETCC);
739 
740   // Promote fp16 arithmetic if fp16 hardware isn't available or the
741   // user passed --nvptx-no-fp16-math. The flag is useful because,
742   // although sm_53+ GPUs have some sort of FP16 support in
743   // hardware, only sm_53 and sm_60 have a full implementation. Others
744   // only have a token amount of hardware and are likely to run faster
745   // by using fp32 units instead.
746   for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
747     setFP16OperationAction(Op, MVT::f16, Legal, Promote);
748     setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
749     setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
750     // bf16 must be promoted to f32.
751     setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
752     if (getOperationAction(Op, MVT::bf16) == Promote)
753       AddPromotedToType(Op, MVT::bf16, MVT::f32);
754   }
755 
756   // f16/f16x2 neg was introduced in PTX 60, SM_53.
757   const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
758                                         STI.getPTXVersion() >= 60 &&
759                                         STI.allowFP16Math();
760   for (const auto &VT : {MVT::f16, MVT::v2f16})
761     setOperationAction(ISD::FNEG, VT,
762                        IsFP16FP16x2NegAvailable ? Legal : Expand);
763 
764   setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
765   setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
766   // (would be) Library functions.
767 
768   // These map to conversion instructions for scalar FP types.
769   for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
770                          ISD::FROUNDEVEN, ISD::FTRUNC}) {
771     setOperationAction(Op, MVT::f16, Legal);
772     setOperationAction(Op, MVT::f32, Legal);
773     setOperationAction(Op, MVT::f64, Legal);
774     setOperationAction(Op, MVT::v2f16, Expand);
775     setOperationAction(Op, MVT::v2bf16, Expand);
776     setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
777     if (getOperationAction(Op, MVT::bf16) == Promote)
778       AddPromotedToType(Op, MVT::bf16, MVT::f32);
779   }
780 
781   if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
782     setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
783   }
784   if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
785     for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
786       setOperationAction(ISD::FP_EXTEND, VT, Custom);
787       setOperationAction(ISD::FP_ROUND, VT, Custom);
788     }
789   }
790 
791   // sm_80 only has conversions between f32 and bf16. Custom lower all other
792   // bf16 conversions.
793   if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
794     for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
795       setOperationAction(
796           {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
797           VT, Custom);
798     }
799     setOperationAction(
800         {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
801         MVT::bf16, Custom);
802   }
803 
804   setOperationAction(ISD::FROUND, MVT::f16, Promote);
805   setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
806   setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);
807   setOperationAction(ISD::FROUND, MVT::f32, Custom);
808   setOperationAction(ISD::FROUND, MVT::f64, Custom);
809   setOperationAction(ISD::FROUND, MVT::bf16, Promote);
810   AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
811 
812   // 'Expand' implements FCOPYSIGN without calling an external library.
813   setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
814   setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
815   setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);
816   setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand);
817   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
818   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
819 
820   // These map to corresponding instructions for f32/f64. f16 must be
821   // promoted to f32. v2f16 is expanded to f16, which is then promoted
822   // to f32.
823   for (const auto &Op :
824        {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {
825     setOperationAction(Op, MVT::f16, Promote);
826     setOperationAction(Op, MVT::f32, Legal);
827     setOperationAction(Op, MVT::f64, Legal);
828     setOperationAction(Op, MVT::v2f16, Expand);
829     setOperationAction(Op, MVT::v2bf16, Expand);
830     setOperationAction(Op, MVT::bf16, Promote);
831     AddPromotedToType(Op, MVT::bf16, MVT::f32);
832   }
833   for (const auto &Op : {ISD::FABS}) {
834     setOperationAction(Op, MVT::f16, Promote);
835     setOperationAction(Op, MVT::f32, Legal);
836     setOperationAction(Op, MVT::f64, Legal);
837     setOperationAction(Op, MVT::v2f16, Expand);
838     setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
839     setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
840     if (getOperationAction(Op, MVT::bf16) == Promote)
841       AddPromotedToType(Op, MVT::bf16, MVT::f32);
842   }
843 
844   // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
845   auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
846     bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
847     return IsAtLeastSm80 ? Legal : NotSm80Action;
848   };
849   for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
850     setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
851     setOperationAction(Op, MVT::f32, Legal);
852     setOperationAction(Op, MVT::f64, Legal);
853     setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
854     setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
855     setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
856     if (getOperationAction(Op, MVT::bf16) == Promote)
857       AddPromotedToType(Op, MVT::bf16, MVT::f32);
858   }
859   for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
860     setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
861     setFP16OperationAction(Op, MVT::bf16, Legal, Expand);
862     setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
863     setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
864     setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
865   }
866 
867   // Custom lowering for inline asm with 128-bit operands
868   setOperationAction(ISD::CopyToReg, MVT::i128, Custom);
869   setOperationAction(ISD::CopyFromReg, MVT::i128, Custom);
870 
871   // No FEXP2, FLOG2.  The PTX ex2 and log2 functions are always approximate.
872   // No FPOW or FREM in PTX.
873 
874   // Now deduce the information based on the above-mentioned
875   // actions.
876   computeRegisterProperties(STI.getRegisterInfo());
877 
878   setMinCmpXchgSizeInBits(32);
879   setMaxAtomicSizeInBitsSupported(64);
880   setMaxDivRemBitWidthSupported(64);
881 }
882 
883 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
884 
885 #define MAKE_CASE(V)                                                           \
886   case V:                                                                      \
887     return #V;
888 
889   switch ((NVPTXISD::NodeType)Opcode) {
890   case NVPTXISD::FIRST_NUMBER:
891     break;
892 
893     MAKE_CASE(NVPTXISD::CALL)
894     MAKE_CASE(NVPTXISD::RET_GLUE)
895     MAKE_CASE(NVPTXISD::LOAD_PARAM)
896     MAKE_CASE(NVPTXISD::Wrapper)
897     MAKE_CASE(NVPTXISD::DeclareParam)
898     MAKE_CASE(NVPTXISD::DeclareScalarParam)
899     MAKE_CASE(NVPTXISD::DeclareRet)
900     MAKE_CASE(NVPTXISD::DeclareScalarRet)
901     MAKE_CASE(NVPTXISD::DeclareRetParam)
902     MAKE_CASE(NVPTXISD::PrintCall)
903     MAKE_CASE(NVPTXISD::PrintConvergentCall)
904     MAKE_CASE(NVPTXISD::PrintCallUni)
905     MAKE_CASE(NVPTXISD::PrintConvergentCallUni)
906     MAKE_CASE(NVPTXISD::LoadParam)
907     MAKE_CASE(NVPTXISD::LoadParamV2)
908     MAKE_CASE(NVPTXISD::LoadParamV4)
909     MAKE_CASE(NVPTXISD::StoreParam)
910     MAKE_CASE(NVPTXISD::StoreParamV2)
911     MAKE_CASE(NVPTXISD::StoreParamV4)
912     MAKE_CASE(NVPTXISD::StoreParamS32)
913     MAKE_CASE(NVPTXISD::StoreParamU32)
914     MAKE_CASE(NVPTXISD::CallArgBegin)
915     MAKE_CASE(NVPTXISD::CallArg)
916     MAKE_CASE(NVPTXISD::LastCallArg)
917     MAKE_CASE(NVPTXISD::CallArgEnd)
918     MAKE_CASE(NVPTXISD::CallVoid)
919     MAKE_CASE(NVPTXISD::CallVal)
920     MAKE_CASE(NVPTXISD::CallSymbol)
921     MAKE_CASE(NVPTXISD::Prototype)
922     MAKE_CASE(NVPTXISD::MoveParam)
923     MAKE_CASE(NVPTXISD::StoreRetval)
924     MAKE_CASE(NVPTXISD::StoreRetvalV2)
925     MAKE_CASE(NVPTXISD::StoreRetvalV4)
926     MAKE_CASE(NVPTXISD::PseudoUseParam)
927     MAKE_CASE(NVPTXISD::RETURN)
928     MAKE_CASE(NVPTXISD::CallSeqBegin)
929     MAKE_CASE(NVPTXISD::CallSeqEnd)
930     MAKE_CASE(NVPTXISD::CallPrototype)
931     MAKE_CASE(NVPTXISD::ProxyReg)
932     MAKE_CASE(NVPTXISD::LoadV2)
933     MAKE_CASE(NVPTXISD::LoadV4)
934     MAKE_CASE(NVPTXISD::LDGV2)
935     MAKE_CASE(NVPTXISD::LDGV4)
936     MAKE_CASE(NVPTXISD::LDUV2)
937     MAKE_CASE(NVPTXISD::LDUV4)
938     MAKE_CASE(NVPTXISD::StoreV2)
939     MAKE_CASE(NVPTXISD::StoreV4)
940     MAKE_CASE(NVPTXISD::FUN_SHFL_CLAMP)
941     MAKE_CASE(NVPTXISD::FUN_SHFR_CLAMP)
942     MAKE_CASE(NVPTXISD::IMAD)
943     MAKE_CASE(NVPTXISD::BFE)
944     MAKE_CASE(NVPTXISD::BFI)
945     MAKE_CASE(NVPTXISD::PRMT)
946     MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
947     MAKE_CASE(NVPTXISD::SETP_F16X2)
948     MAKE_CASE(NVPTXISD::SETP_BF16X2)
949     MAKE_CASE(NVPTXISD::Dummy)
950     MAKE_CASE(NVPTXISD::MUL_WIDE_SIGNED)
951     MAKE_CASE(NVPTXISD::MUL_WIDE_UNSIGNED)
952     MAKE_CASE(NVPTXISD::Tex1DFloatS32)
953     MAKE_CASE(NVPTXISD::Tex1DFloatFloat)
954     MAKE_CASE(NVPTXISD::Tex1DFloatFloatLevel)
955     MAKE_CASE(NVPTXISD::Tex1DFloatFloatGrad)
956     MAKE_CASE(NVPTXISD::Tex1DS32S32)
957     MAKE_CASE(NVPTXISD::Tex1DS32Float)
958     MAKE_CASE(NVPTXISD::Tex1DS32FloatLevel)
959     MAKE_CASE(NVPTXISD::Tex1DS32FloatGrad)
960     MAKE_CASE(NVPTXISD::Tex1DU32S32)
961     MAKE_CASE(NVPTXISD::Tex1DU32Float)
962     MAKE_CASE(NVPTXISD::Tex1DU32FloatLevel)
963     MAKE_CASE(NVPTXISD::Tex1DU32FloatGrad)
964     MAKE_CASE(NVPTXISD::Tex1DArrayFloatS32)
965     MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloat)
966     MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatLevel)
967     MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatGrad)
968     MAKE_CASE(NVPTXISD::Tex1DArrayS32S32)
969     MAKE_CASE(NVPTXISD::Tex1DArrayS32Float)
970     MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatLevel)
971     MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatGrad)
972     MAKE_CASE(NVPTXISD::Tex1DArrayU32S32)
973     MAKE_CASE(NVPTXISD::Tex1DArrayU32Float)
974     MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatLevel)
975     MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatGrad)
976     MAKE_CASE(NVPTXISD::Tex2DFloatS32)
977     MAKE_CASE(NVPTXISD::Tex2DFloatFloat)
978     MAKE_CASE(NVPTXISD::Tex2DFloatFloatLevel)
979     MAKE_CASE(NVPTXISD::Tex2DFloatFloatGrad)
980     MAKE_CASE(NVPTXISD::Tex2DS32S32)
981     MAKE_CASE(NVPTXISD::Tex2DS32Float)
982     MAKE_CASE(NVPTXISD::Tex2DS32FloatLevel)
983     MAKE_CASE(NVPTXISD::Tex2DS32FloatGrad)
984     MAKE_CASE(NVPTXISD::Tex2DU32S32)
985     MAKE_CASE(NVPTXISD::Tex2DU32Float)
986     MAKE_CASE(NVPTXISD::Tex2DU32FloatLevel)
987     MAKE_CASE(NVPTXISD::Tex2DU32FloatGrad)
988     MAKE_CASE(NVPTXISD::Tex2DArrayFloatS32)
989     MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloat)
990     MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatLevel)
991     MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatGrad)
992     MAKE_CASE(NVPTXISD::Tex2DArrayS32S32)
993     MAKE_CASE(NVPTXISD::Tex2DArrayS32Float)
994     MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatLevel)
995     MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatGrad)
996     MAKE_CASE(NVPTXISD::Tex2DArrayU32S32)
997     MAKE_CASE(NVPTXISD::Tex2DArrayU32Float)
998     MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatLevel)
999     MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatGrad)
1000     MAKE_CASE(NVPTXISD::Tex3DFloatS32)
1001     MAKE_CASE(NVPTXISD::Tex3DFloatFloat)
1002     MAKE_CASE(NVPTXISD::Tex3DFloatFloatLevel)
1003     MAKE_CASE(NVPTXISD::Tex3DFloatFloatGrad)
1004     MAKE_CASE(NVPTXISD::Tex3DS32S32)
1005     MAKE_CASE(NVPTXISD::Tex3DS32Float)
1006     MAKE_CASE(NVPTXISD::Tex3DS32FloatLevel)
1007     MAKE_CASE(NVPTXISD::Tex3DS32FloatGrad)
1008     MAKE_CASE(NVPTXISD::Tex3DU32S32)
1009     MAKE_CASE(NVPTXISD::Tex3DU32Float)
1010     MAKE_CASE(NVPTXISD::Tex3DU32FloatLevel)
1011     MAKE_CASE(NVPTXISD::Tex3DU32FloatGrad)
1012     MAKE_CASE(NVPTXISD::TexCubeFloatFloat)
1013     MAKE_CASE(NVPTXISD::TexCubeFloatFloatLevel)
1014     MAKE_CASE(NVPTXISD::TexCubeS32Float)
1015     MAKE_CASE(NVPTXISD::TexCubeS32FloatLevel)
1016     MAKE_CASE(NVPTXISD::TexCubeU32Float)
1017     MAKE_CASE(NVPTXISD::TexCubeU32FloatLevel)
1018     MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloat)
1019     MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloatLevel)
1020     MAKE_CASE(NVPTXISD::TexCubeArrayS32Float)
1021     MAKE_CASE(NVPTXISD::TexCubeArrayS32FloatLevel)
1022     MAKE_CASE(NVPTXISD::TexCubeArrayU32Float)
1023     MAKE_CASE(NVPTXISD::TexCubeArrayU32FloatLevel)
1024     MAKE_CASE(NVPTXISD::Tld4R2DFloatFloat)
1025     MAKE_CASE(NVPTXISD::Tld4G2DFloatFloat)
1026     MAKE_CASE(NVPTXISD::Tld4B2DFloatFloat)
1027     MAKE_CASE(NVPTXISD::Tld4A2DFloatFloat)
1028     MAKE_CASE(NVPTXISD::Tld4R2DS64Float)
1029     MAKE_CASE(NVPTXISD::Tld4G2DS64Float)
1030     MAKE_CASE(NVPTXISD::Tld4B2DS64Float)
1031     MAKE_CASE(NVPTXISD::Tld4A2DS64Float)
1032     MAKE_CASE(NVPTXISD::Tld4R2DU64Float)
1033     MAKE_CASE(NVPTXISD::Tld4G2DU64Float)
1034     MAKE_CASE(NVPTXISD::Tld4B2DU64Float)
1035     MAKE_CASE(NVPTXISD::Tld4A2DU64Float)
1036 
1037     MAKE_CASE(NVPTXISD::TexUnified1DFloatS32)
1038     MAKE_CASE(NVPTXISD::TexUnified1DFloatFloat)
1039     MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatLevel)
1040     MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatGrad)
1041     MAKE_CASE(NVPTXISD::TexUnified1DS32S32)
1042     MAKE_CASE(NVPTXISD::TexUnified1DS32Float)
1043     MAKE_CASE(NVPTXISD::TexUnified1DS32FloatLevel)
1044     MAKE_CASE(NVPTXISD::TexUnified1DS32FloatGrad)
1045     MAKE_CASE(NVPTXISD::TexUnified1DU32S32)
1046     MAKE_CASE(NVPTXISD::TexUnified1DU32Float)
1047     MAKE_CASE(NVPTXISD::TexUnified1DU32FloatLevel)
1048     MAKE_CASE(NVPTXISD::TexUnified1DU32FloatGrad)
1049     MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatS32)
1050     MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloat)
1051     MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatLevel)
1052     MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatGrad)
1053     MAKE_CASE(NVPTXISD::TexUnified1DArrayS32S32)
1054     MAKE_CASE(NVPTXISD::TexUnified1DArrayS32Float)
1055     MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatLevel)
1056     MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatGrad)
1057     MAKE_CASE(NVPTXISD::TexUnified1DArrayU32S32)
1058     MAKE_CASE(NVPTXISD::TexUnified1DArrayU32Float)
1059     MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatLevel)
1060     MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatGrad)
1061     MAKE_CASE(NVPTXISD::TexUnified2DFloatS32)
1062     MAKE_CASE(NVPTXISD::TexUnified2DFloatFloat)
1063     MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatLevel)
1064     MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatGrad)
1065     MAKE_CASE(NVPTXISD::TexUnified2DS32S32)
1066     MAKE_CASE(NVPTXISD::TexUnified2DS32Float)
1067     MAKE_CASE(NVPTXISD::TexUnified2DS32FloatLevel)
1068     MAKE_CASE(NVPTXISD::TexUnified2DS32FloatGrad)
1069     MAKE_CASE(NVPTXISD::TexUnified2DU32S32)
1070     MAKE_CASE(NVPTXISD::TexUnified2DU32Float)
1071     MAKE_CASE(NVPTXISD::TexUnified2DU32FloatLevel)
1072     MAKE_CASE(NVPTXISD::TexUnified2DU32FloatGrad)
1073     MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatS32)
1074     MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloat)
1075     MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatLevel)
1076     MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatGrad)
1077     MAKE_CASE(NVPTXISD::TexUnified2DArrayS32S32)
1078     MAKE_CASE(NVPTXISD::TexUnified2DArrayS32Float)
1079     MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatLevel)
1080     MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatGrad)
1081     MAKE_CASE(NVPTXISD::TexUnified2DArrayU32S32)
1082     MAKE_CASE(NVPTXISD::TexUnified2DArrayU32Float)
1083     MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatLevel)
1084     MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatGrad)
1085     MAKE_CASE(NVPTXISD::TexUnified3DFloatS32)
1086     MAKE_CASE(NVPTXISD::TexUnified3DFloatFloat)
1087     MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatLevel)
1088     MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatGrad)
1089     MAKE_CASE(NVPTXISD::TexUnified3DS32S32)
1090     MAKE_CASE(NVPTXISD::TexUnified3DS32Float)
1091     MAKE_CASE(NVPTXISD::TexUnified3DS32FloatLevel)
1092     MAKE_CASE(NVPTXISD::TexUnified3DS32FloatGrad)
1093     MAKE_CASE(NVPTXISD::TexUnified3DU32S32)
1094     MAKE_CASE(NVPTXISD::TexUnified3DU32Float)
1095     MAKE_CASE(NVPTXISD::TexUnified3DU32FloatLevel)
1096     MAKE_CASE(NVPTXISD::TexUnified3DU32FloatGrad)
1097     MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloat)
1098     MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatLevel)
1099     MAKE_CASE(NVPTXISD::TexUnifiedCubeS32Float)
1100     MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatLevel)
1101     MAKE_CASE(NVPTXISD::TexUnifiedCubeU32Float)
1102     MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatLevel)
1103     MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloat)
1104     MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel)
1105     MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32Float)
1106     MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatLevel)
1107     MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32Float)
1108     MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatLevel)
1109     MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatGrad)
1110     MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatGrad)
1111     MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatGrad)
1112     MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad)
1113     MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatGrad)
1114     MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatGrad)
1115     MAKE_CASE(NVPTXISD::Tld4UnifiedR2DFloatFloat)
1116     MAKE_CASE(NVPTXISD::Tld4UnifiedG2DFloatFloat)
1117     MAKE_CASE(NVPTXISD::Tld4UnifiedB2DFloatFloat)
1118     MAKE_CASE(NVPTXISD::Tld4UnifiedA2DFloatFloat)
1119     MAKE_CASE(NVPTXISD::Tld4UnifiedR2DS64Float)
1120     MAKE_CASE(NVPTXISD::Tld4UnifiedG2DS64Float)
1121     MAKE_CASE(NVPTXISD::Tld4UnifiedB2DS64Float)
1122     MAKE_CASE(NVPTXISD::Tld4UnifiedA2DS64Float)
1123     MAKE_CASE(NVPTXISD::Tld4UnifiedR2DU64Float)
1124     MAKE_CASE(NVPTXISD::Tld4UnifiedG2DU64Float)
1125     MAKE_CASE(NVPTXISD::Tld4UnifiedB2DU64Float)
1126     MAKE_CASE(NVPTXISD::Tld4UnifiedA2DU64Float)
1127 
1128     MAKE_CASE(NVPTXISD::Suld1DI8Clamp)
1129     MAKE_CASE(NVPTXISD::Suld1DI16Clamp)
1130     MAKE_CASE(NVPTXISD::Suld1DI32Clamp)
1131     MAKE_CASE(NVPTXISD::Suld1DI64Clamp)
1132     MAKE_CASE(NVPTXISD::Suld1DV2I8Clamp)
1133     MAKE_CASE(NVPTXISD::Suld1DV2I16Clamp)
1134     MAKE_CASE(NVPTXISD::Suld1DV2I32Clamp)
1135     MAKE_CASE(NVPTXISD::Suld1DV2I64Clamp)
1136     MAKE_CASE(NVPTXISD::Suld1DV4I8Clamp)
1137     MAKE_CASE(NVPTXISD::Suld1DV4I16Clamp)
1138     MAKE_CASE(NVPTXISD::Suld1DV4I32Clamp)
1139 
1140     MAKE_CASE(NVPTXISD::Suld1DArrayI8Clamp)
1141     MAKE_CASE(NVPTXISD::Suld1DArrayI16Clamp)
1142     MAKE_CASE(NVPTXISD::Suld1DArrayI32Clamp)
1143     MAKE_CASE(NVPTXISD::Suld1DArrayI64Clamp)
1144     MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Clamp)
1145     MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Clamp)
1146     MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Clamp)
1147     MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Clamp)
1148     MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Clamp)
1149     MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Clamp)
1150     MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Clamp)
1151 
1152     MAKE_CASE(NVPTXISD::Suld2DI8Clamp)
1153     MAKE_CASE(NVPTXISD::Suld2DI16Clamp)
1154     MAKE_CASE(NVPTXISD::Suld2DI32Clamp)
1155     MAKE_CASE(NVPTXISD::Suld2DI64Clamp)
1156     MAKE_CASE(NVPTXISD::Suld2DV2I8Clamp)
1157     MAKE_CASE(NVPTXISD::Suld2DV2I16Clamp)
1158     MAKE_CASE(NVPTXISD::Suld2DV2I32Clamp)
1159     MAKE_CASE(NVPTXISD::Suld2DV2I64Clamp)
1160     MAKE_CASE(NVPTXISD::Suld2DV4I8Clamp)
1161     MAKE_CASE(NVPTXISD::Suld2DV4I16Clamp)
1162     MAKE_CASE(NVPTXISD::Suld2DV4I32Clamp)
1163 
1164     MAKE_CASE(NVPTXISD::Suld2DArrayI8Clamp)
1165     MAKE_CASE(NVPTXISD::Suld2DArrayI16Clamp)
1166     MAKE_CASE(NVPTXISD::Suld2DArrayI32Clamp)
1167     MAKE_CASE(NVPTXISD::Suld2DArrayI64Clamp)
1168     MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Clamp)
1169     MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Clamp)
1170     MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Clamp)
1171     MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Clamp)
1172     MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Clamp)
1173     MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Clamp)
1174     MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Clamp)
1175 
1176     MAKE_CASE(NVPTXISD::Suld3DI8Clamp)
1177     MAKE_CASE(NVPTXISD::Suld3DI16Clamp)
1178     MAKE_CASE(NVPTXISD::Suld3DI32Clamp)
1179     MAKE_CASE(NVPTXISD::Suld3DI64Clamp)
1180     MAKE_CASE(NVPTXISD::Suld3DV2I8Clamp)
1181     MAKE_CASE(NVPTXISD::Suld3DV2I16Clamp)
1182     MAKE_CASE(NVPTXISD::Suld3DV2I32Clamp)
1183     MAKE_CASE(NVPTXISD::Suld3DV2I64Clamp)
1184     MAKE_CASE(NVPTXISD::Suld3DV4I8Clamp)
1185     MAKE_CASE(NVPTXISD::Suld3DV4I16Clamp)
1186     MAKE_CASE(NVPTXISD::Suld3DV4I32Clamp)
1187 
1188     MAKE_CASE(NVPTXISD::Suld1DI8Trap)
1189     MAKE_CASE(NVPTXISD::Suld1DI16Trap)
1190     MAKE_CASE(NVPTXISD::Suld1DI32Trap)
1191     MAKE_CASE(NVPTXISD::Suld1DI64Trap)
1192     MAKE_CASE(NVPTXISD::Suld1DV2I8Trap)
1193     MAKE_CASE(NVPTXISD::Suld1DV2I16Trap)
1194     MAKE_CASE(NVPTXISD::Suld1DV2I32Trap)
1195     MAKE_CASE(NVPTXISD::Suld1DV2I64Trap)
1196     MAKE_CASE(NVPTXISD::Suld1DV4I8Trap)
1197     MAKE_CASE(NVPTXISD::Suld1DV4I16Trap)
1198     MAKE_CASE(NVPTXISD::Suld1DV4I32Trap)
1199 
1200     MAKE_CASE(NVPTXISD::Suld1DArrayI8Trap)
1201     MAKE_CASE(NVPTXISD::Suld1DArrayI16Trap)
1202     MAKE_CASE(NVPTXISD::Suld1DArrayI32Trap)
1203     MAKE_CASE(NVPTXISD::Suld1DArrayI64Trap)
1204     MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Trap)
1205     MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Trap)
1206     MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Trap)
1207     MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Trap)
1208     MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Trap)
1209     MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Trap)
1210     MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Trap)
1211 
1212     MAKE_CASE(NVPTXISD::Suld2DI8Trap)
1213     MAKE_CASE(NVPTXISD::Suld2DI16Trap)
1214     MAKE_CASE(NVPTXISD::Suld2DI32Trap)
1215     MAKE_CASE(NVPTXISD::Suld2DI64Trap)
1216     MAKE_CASE(NVPTXISD::Suld2DV2I8Trap)
1217     MAKE_CASE(NVPTXISD::Suld2DV2I16Trap)
1218     MAKE_CASE(NVPTXISD::Suld2DV2I32Trap)
1219     MAKE_CASE(NVPTXISD::Suld2DV2I64Trap)
1220     MAKE_CASE(NVPTXISD::Suld2DV4I8Trap)
1221     MAKE_CASE(NVPTXISD::Suld2DV4I16Trap)
1222     MAKE_CASE(NVPTXISD::Suld2DV4I32Trap)
1223 
1224     MAKE_CASE(NVPTXISD::Suld2DArrayI8Trap)
1225     MAKE_CASE(NVPTXISD::Suld2DArrayI16Trap)
1226     MAKE_CASE(NVPTXISD::Suld2DArrayI32Trap)
1227     MAKE_CASE(NVPTXISD::Suld2DArrayI64Trap)
1228     MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Trap)
1229     MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Trap)
1230     MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Trap)
1231     MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Trap)
1232     MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Trap)
1233     MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Trap)
1234     MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Trap)
1235 
1236     MAKE_CASE(NVPTXISD::Suld3DI8Trap)
1237     MAKE_CASE(NVPTXISD::Suld3DI16Trap)
1238     MAKE_CASE(NVPTXISD::Suld3DI32Trap)
1239     MAKE_CASE(NVPTXISD::Suld3DI64Trap)
1240     MAKE_CASE(NVPTXISD::Suld3DV2I8Trap)
1241     MAKE_CASE(NVPTXISD::Suld3DV2I16Trap)
1242     MAKE_CASE(NVPTXISD::Suld3DV2I32Trap)
1243     MAKE_CASE(NVPTXISD::Suld3DV2I64Trap)
1244     MAKE_CASE(NVPTXISD::Suld3DV4I8Trap)
1245     MAKE_CASE(NVPTXISD::Suld3DV4I16Trap)
1246     MAKE_CASE(NVPTXISD::Suld3DV4I32Trap)
1247 
1248     MAKE_CASE(NVPTXISD::Suld1DI8Zero)
1249     MAKE_CASE(NVPTXISD::Suld1DI16Zero)
1250     MAKE_CASE(NVPTXISD::Suld1DI32Zero)
1251     MAKE_CASE(NVPTXISD::Suld1DI64Zero)
1252     MAKE_CASE(NVPTXISD::Suld1DV2I8Zero)
1253     MAKE_CASE(NVPTXISD::Suld1DV2I16Zero)
1254     MAKE_CASE(NVPTXISD::Suld1DV2I32Zero)
1255     MAKE_CASE(NVPTXISD::Suld1DV2I64Zero)
1256     MAKE_CASE(NVPTXISD::Suld1DV4I8Zero)
1257     MAKE_CASE(NVPTXISD::Suld1DV4I16Zero)
1258     MAKE_CASE(NVPTXISD::Suld1DV4I32Zero)
1259 
1260     MAKE_CASE(NVPTXISD::Suld1DArrayI8Zero)
1261     MAKE_CASE(NVPTXISD::Suld1DArrayI16Zero)
1262     MAKE_CASE(NVPTXISD::Suld1DArrayI32Zero)
1263     MAKE_CASE(NVPTXISD::Suld1DArrayI64Zero)
1264     MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Zero)
1265     MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Zero)
1266     MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Zero)
1267     MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Zero)
1268     MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Zero)
1269     MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Zero)
1270     MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Zero)
1271 
1272     MAKE_CASE(NVPTXISD::Suld2DI8Zero)
1273     MAKE_CASE(NVPTXISD::Suld2DI16Zero)
1274     MAKE_CASE(NVPTXISD::Suld2DI32Zero)
1275     MAKE_CASE(NVPTXISD::Suld2DI64Zero)
1276     MAKE_CASE(NVPTXISD::Suld2DV2I8Zero)
1277     MAKE_CASE(NVPTXISD::Suld2DV2I16Zero)
1278     MAKE_CASE(NVPTXISD::Suld2DV2I32Zero)
1279     MAKE_CASE(NVPTXISD::Suld2DV2I64Zero)
1280     MAKE_CASE(NVPTXISD::Suld2DV4I8Zero)
1281     MAKE_CASE(NVPTXISD::Suld2DV4I16Zero)
1282     MAKE_CASE(NVPTXISD::Suld2DV4I32Zero)
1283 
1284     MAKE_CASE(NVPTXISD::Suld2DArrayI8Zero)
1285     MAKE_CASE(NVPTXISD::Suld2DArrayI16Zero)
1286     MAKE_CASE(NVPTXISD::Suld2DArrayI32Zero)
1287     MAKE_CASE(NVPTXISD::Suld2DArrayI64Zero)
1288     MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Zero)
1289     MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Zero)
1290     MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Zero)
1291     MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Zero)
1292     MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Zero)
1293     MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Zero)
1294     MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Zero)
1295 
1296     MAKE_CASE(NVPTXISD::Suld3DI8Zero)
1297     MAKE_CASE(NVPTXISD::Suld3DI16Zero)
1298     MAKE_CASE(NVPTXISD::Suld3DI32Zero)
1299     MAKE_CASE(NVPTXISD::Suld3DI64Zero)
1300     MAKE_CASE(NVPTXISD::Suld3DV2I8Zero)
1301     MAKE_CASE(NVPTXISD::Suld3DV2I16Zero)
1302     MAKE_CASE(NVPTXISD::Suld3DV2I32Zero)
1303     MAKE_CASE(NVPTXISD::Suld3DV2I64Zero)
1304     MAKE_CASE(NVPTXISD::Suld3DV4I8Zero)
1305     MAKE_CASE(NVPTXISD::Suld3DV4I16Zero)
1306     MAKE_CASE(NVPTXISD::Suld3DV4I32Zero)
1307   }
1308   return nullptr;
1309 
1310 #undef MAKE_CASE
1311 }
1312 
1313 TargetLoweringBase::LegalizeTypeAction
1314 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1315   if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1316       VT.getScalarType() == MVT::i1)
1317     return TypeSplitVector;
1318   if (Isv2x16VT(VT))
1319     return TypeLegal;
1320   return TargetLoweringBase::getPreferredVectorAction(VT);
1321 }
1322 
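     // Emit an approximate square-root (or reciprocal square-root) estimate for
     // f32/f64 when approximate square roots are permitted. When refinement steps
     // are requested, an rsqrt approximation is returned even for a plain sqrt,
     // since the refinement code expects to start from rsqrt (see comment below).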
1323 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1324                                              int Enabled, int &ExtraSteps,
1325                                              bool &UseOneConst,
1326                                              bool Reciprocal) const {
1327   if (!(Enabled == ReciprocalEstimate::Enabled ||
1328         (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
1329     return SDValue();
1330 
1331   if (ExtraSteps == ReciprocalEstimate::Unspecified)
1332     ExtraSteps = 0;
1333 
1334   SDLoc DL(Operand);
1335   EVT VT = Operand.getValueType();
1336   bool Ftz = useF32FTZ(DAG.getMachineFunction());
1337 
1338   auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1339     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1340                        DAG.getConstant(IID, DL, MVT::i32), Operand);
1341   };
1342 
1343   // The sqrt and rsqrt refinement processes assume we always start out with an
1344   // approximation of the rsqrt.  Therefore, if we're going to do any refinement
1345   // (i.e. ExtraSteps > 0), we must return an rsqrt.  But if we're *not* doing
1346   // any refinement, we must return a regular sqrt.
1347   if (Reciprocal || ExtraSteps > 0) {
1348     if (VT == MVT::f32)
1349       return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1350                                    : Intrinsic::nvvm_rsqrt_approx_f);
1351     else if (VT == MVT::f64)
1352       return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1353     else
1354       return SDValue();
1355   } else {
1356     if (VT == MVT::f32)
1357       return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1358                                    : Intrinsic::nvvm_sqrt_approx_f);
1359     else {
1360       // There's no sqrt.approx.f64 instruction, so we emit
1361       // reciprocal(rsqrt(x)).  This is faster than
1362       // select(x == 0, 0, x * rsqrt(x)).  (In fact, it's faster than plain
1363       // x * rsqrt(x).)
1364       return DAG.getNode(
1365           ISD::INTRINSIC_WO_CHAIN, DL, VT,
1366           DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1367           MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1368     }
1369   }
1370 }
1371 
1372 SDValue
1373 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
1374   SDLoc dl(Op);
1375   const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1376   auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1377   Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1378   return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1379 }
1380 
1381 static bool IsTypePassedAsArray(const Type *Ty) {
1382   return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) ||
1383          Ty->isHalfTy() || Ty->isBFloatTy();
1384 }
1385 
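     // Illustrative example (not normative): for a hypothetical callee such as
     //   float bar(float, char *, struct S /* 24 bytes, align 8 */);
     // the prototype string built below would look roughly like
     //   prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _,
     //       .param .b64 _, .param .align 8 .b8 _[24]);
     // with the exact sizes and alignments taken from the DataLayout queries below.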
1386 std::string NVPTXTargetLowering::getPrototype(
1387     const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1388     const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
1389     std::optional<std::pair<unsigned, const APInt &>> VAInfo,
1390     const CallBase &CB, unsigned UniqueCallSite) const {
1391   auto PtrVT = getPointerTy(DL);
1392 
1393   bool isABI = (STI.getSmVersion() >= 20);
1394   assert(isABI && "Non-ABI compilation is not supported");
1395   if (!isABI)
1396     return "";
1397 
1398   std::string Prototype;
1399   raw_string_ostream O(Prototype);
1400   O << "prototype_" << UniqueCallSite << " : .callprototype ";
1401 
1402   if (retTy->getTypeID() == Type::VoidTyID) {
1403     O << "()";
1404   } else {
1405     O << "(";
1406     if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) &&
1407         !IsTypePassedAsArray(retTy)) {
1408       unsigned size = 0;
1409       if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1410         size = ITy->getBitWidth();
1411       } else {
1412         assert(retTy->isFloatingPointTy() &&
1413                "Floating point type expected here");
1414         size = retTy->getPrimitiveSizeInBits();
1415       }
1416       // PTX ABI requires all scalar return values to be at least 32
1417       // bits in size.  fp16 normally uses .b16 as its storage type in
1418       // PTX, so its size must be adjusted here, too.
1419       size = promoteScalarArgumentSize(size);
1420 
1421       O << ".param .b" << size << " _";
1422     } else if (isa<PointerType>(retTy)) {
1423       O << ".param .b" << PtrVT.getSizeInBits() << " _";
1424     } else if (IsTypePassedAsArray(retTy)) {
1425       O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
1426         << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
1427     } else {
1428       llvm_unreachable("Unknown return type");
1429     }
1430     O << ") ";
1431   }
1432   O << "_ (";
1433 
1434   bool first = true;
1435 
1436   unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
1437   for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
1438     Type *Ty = Args[i].Ty;
1439     if (!first) {
1440       O << ", ";
1441     }
1442     first = false;
1443 
1444     if (!Outs[OIdx].Flags.isByVal()) {
1445       if (IsTypePassedAsArray(Ty)) {
1446         Align ParamAlign =
1447             getArgumentAlignment(&CB, Ty, i + AttributeList::FirstArgIndex, DL);
1448         O << ".param .align " << ParamAlign.value() << " .b8 ";
1449         O << "_";
1450         O << "[" << DL.getTypeAllocSize(Ty) << "]";
1451         // update the index for Outs
1452         SmallVector<EVT, 16> vtparts;
1453         ComputeValueVTs(*this, DL, Ty, vtparts);
1454         if (unsigned len = vtparts.size())
1455           OIdx += len - 1;
1456         continue;
1457       }
1458       // i8 types in IR will be i16 types in SDAG
1459       assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1460               (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1461              "type mismatch between callee prototype and arguments");
1462       // scalar type
1463       unsigned sz = 0;
1464       if (isa<IntegerType>(Ty)) {
1465         sz = cast<IntegerType>(Ty)->getBitWidth();
1466         sz = promoteScalarArgumentSize(sz);
1467       } else if (isa<PointerType>(Ty)) {
1468         sz = PtrVT.getSizeInBits();
1469       } else {
1470         sz = Ty->getPrimitiveSizeInBits();
1471       }
1472       O << ".param .b" << sz << " ";
1473       O << "_";
1474       continue;
1475     }
1476 
1477     // Indirect calls need strict ABI alignment so we disable optimizations by
1478     // not providing a function to optimize.
1479     Type *ETy = Args[i].IndirectType;
1480     Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1481     Align ParamByValAlign =
1482         getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
1483 
1484     O << ".param .align " << ParamByValAlign.value() << " .b8 ";
1485     O << "_";
1486     O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
1487   }
1488 
1489   if (VAInfo)
1490     O << (first ? "" : ",") << " .param .align " << VAInfo->second
1491       << " .b8 _[]\n";
1492   O << ")";
1493   if (shouldEmitPTXNoReturn(&CB, *nvTM))
1494     O << " .noreturn";
1495   O << ";";
1496 
1497   return Prototype;
1498 }
1499 
1500 Align NVPTXTargetLowering::getFunctionArgumentAlignment(
1501     const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const {
1502   return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL));
1503 }
1504 
1505 Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
1506                                                 unsigned Idx,
1507                                                 const DataLayout &DL) const {
1508   if (!CB) {
1509     // CallSite is null, fall back to the ABI type alignment
1510     return DL.getABITypeAlign(Ty);
1511   }
1512 
1513   const Function *DirectCallee = CB->getCalledFunction();
1514 
1515   if (!DirectCallee) {
1516     // We don't have a direct function symbol, but that may be because of
1517     // constant cast instructions in the call.
1518 
1519     // With bitcast'd call targets, the instruction will be the call
1520     if (const auto *CI = dyn_cast<CallInst>(CB)) {
1521       // Check if we have call alignment metadata
1522       if (MaybeAlign StackAlign = getAlign(*CI, Idx))
1523         return StackAlign.value();
1524     }
1525     DirectCallee = getMaybeBitcastedCallee(CB);
1526   }
1527 
1528   // Check for function alignment information if we found that the
1529   // ultimate target is a Function
1530   if (DirectCallee)
1531     return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);
1532 
1533   // Call is indirect, fall back to the ABI type alignment
1534   return DL.getABITypeAlign(Ty);
1535 }
1536 
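     // Map an FP element type to a same-sized integer type so the byte-wise
     // shift/mask logic below can operate on it (f16/bf16 -> i16, f32 and packed
     // v2f16/v2bf16 -> i32, f64 -> i64). Returns true if the type was changed.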
1537 static bool adjustElementType(EVT &ElementType) {
1538   switch (ElementType.getSimpleVT().SimpleTy) {
1539   default:
1540     return false;
1541   case MVT::f16:
1542   case MVT::bf16:
1543     ElementType = MVT::i16;
1544     return true;
1545   case MVT::f32:
1546   case MVT::v2f16:
1547   case MVT::v2bf16:
1548     ElementType = MVT::i32;
1549     return true;
1550   case MVT::f64:
1551     ElementType = MVT::i64;
1552     return true;
1553   }
1554 }
1555 
1556 // Use byte-store when the param address of the argument value is unaligned.
1557 // This may happen when the return value is a field of a packed structure.
1558 //
1559 // This is called in LowerCall() when passing the param values.
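     //
     // Illustrative example: an i32 argument piece at param offset 2 is emitted as
     // four st.param.b8 stores at offsets 2..5, each storing one byte of the
     // (possibly bitcast) integer value shifted down into the low byte.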
1560 static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
1561                                         uint64_t Offset, EVT ElementType,
1562                                         SDValue StVal, SDValue &InGlue,
1563                                         unsigned ArgID, const SDLoc &dl) {
1564   // Bit logic only works on integer types
1565   if (adjustElementType(ElementType))
1566     StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);
1567 
1568   // Store each byte
1569   SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1570   for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
1571     // Shift the byte to the last byte position
1572     SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
1573                                    DAG.getConstant(i * 8, dl, MVT::i32));
1574     SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
1575                                DAG.getConstant(Offset + i, dl, MVT::i32),
1576                                ShiftVal, InGlue};
1577     // Trunc store only the last byte by using
1578     //     st.param.b8
1579     // The register type can be larger than b8.
1580     Chain = DAG.getMemIntrinsicNode(
1581         NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
1582         MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
1583     InGlue = Chain.getValue(1);
1584   }
1585   return Chain;
1586 }
1587 
1588 // Use byte-load when the param address of the returned value is unaligned.
1589 // This may happen when the returned value is a field of a packed structure.
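     //
     // Mirroring the store case above (illustrative): an i32 return piece at
     // offset 2 is reassembled from four b8 loads, each zero-extended, masked to
     // 8 bits, shifted into position, and OR'ed into the result.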
1590 static SDValue
1591 LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
1592                            EVT ElementType, SDValue &InGlue,
1593                            SmallVectorImpl<SDValue> &TempProxyRegOps,
1594                            const SDLoc &dl) {
1595   // Bit logic only works on integer types
1596   EVT MergedType = ElementType;
1597   adjustElementType(MergedType);
1598 
1599   // Load each byte and construct the whole value. The initial value is 0.
1600   SDValue RetVal = DAG.getConstant(0, dl, MergedType);
1601   // LoadParamMemI8 loads into i16 register only
1602   SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);
1603   for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
1604     SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
1605                               DAG.getConstant(Offset + i, dl, MVT::i32),
1606                               InGlue};
1607     // This will be selected to LoadParamMemI8
1608     SDValue LdVal =
1609         DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,
1610                                 MVT::i8, MachinePointerInfo(), Align(1));
1611     SDValue TmpLdVal = LdVal.getValue(0);
1612     Chain = LdVal.getValue(1);
1613     InGlue = LdVal.getValue(2);
1614 
1615     TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl,
1616                            TmpLdVal.getSimpleValueType(), TmpLdVal);
1617     TempProxyRegOps.push_back(TmpLdVal);
1618 
1619     SDValue CMask = DAG.getConstant(255, dl, MergedType);
1620     SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);
1621     // Need to extend the i16 register to the whole width.
1622     TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal);
1623     // Mask off the high bits. Leave only the lower 8 bits.
1624     // Do this because we are using loadparam.b8.
1625     TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask);
1626     // Shift and merge
1627     TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift);
1628     RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal);
1629   }
1630   if (ElementType != MergedType)
1631     RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
1632 
1633   return RetVal;
1634 }
1635 
1636 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1637                                        SmallVectorImpl<SDValue> &InVals) const {
1638 
1639   if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1640     report_fatal_error(
1641         "Support for variadic functions (unsized array parameter) introduced "
1642         "in PTX ISA version 6.0 and requires target sm_30.");
1643 
1644   SelectionDAG &DAG = CLI.DAG;
1645   SDLoc dl = CLI.DL;
1646   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1647   SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1648   SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1649   SDValue Chain = CLI.Chain;
1650   SDValue Callee = CLI.Callee;
1651   bool &isTailCall = CLI.IsTailCall;
1652   ArgListTy &Args = CLI.getArgs();
1653   Type *RetTy = CLI.RetTy;
1654   const CallBase *CB = CLI.CB;
1655   const DataLayout &DL = DAG.getDataLayout();
1656 
1657   bool isABI = (STI.getSmVersion() >= 20);
1658   assert(isABI && "Non-ABI compilation is not supported");
1659   if (!isABI)
1660     return Chain;
1661 
1662   // Variadic arguments.
1663   //
1664   // Normally, for each argument, we declare a param scalar or a param
1665   // byte array in the .param space, and store the argument value to that
1666   // param scalar or array starting at offset 0.
1667   //
1668   // In the case of the first variadic argument, we declare a vararg byte array
1669   // with size 0. The exact size of this array isn't known at this point, so
1670   // it'll be patched later. All the variadic arguments will be stored to this
1671   // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1672   // initially set to 0, so it can be used for non-variadic arguments (which use
1673   // 0 offset) to simplify the code.
1674   //
1675   // After all varargs are processed, 'VAOffset' holds the size of the
1676   // vararg byte array.
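       //
       // For instance (illustrative only), a call to a hypothetical
       // 'int foo(int, ...)' with two variadic arguments declares one fixed .param
       // for the int plus a single vararg byte array; the array's size operand is
       // back-patched from VAOffset once the variadic values have been laid out.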
1677 
1678   SDValue VADeclareParam;                 // vararg byte array
1679   unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic
1680   unsigned VAOffset = 0;                  // current offset in the param array
1681 
1682   unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
1683   SDValue TempChain = Chain;
1684   Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
1685   SDValue InGlue = Chain.getValue(1);
1686 
1687   unsigned ParamCount = 0;
1688   // Args.size() and Outs.size() need not match.
1689   // Outs.size() will be larger
1690   //   * if there is an aggregate argument with multiple fields (each field
1691   //     showing up separately in Outs)
1692   //   * if there is a vector argument with more than typical vector-length
1693   //     elements (generally if more than 4) where each vector element is
1694   //     individually present in Outs.
1695   // So a different index should be used for indexing into Outs/OutVals.
1696   // See similar issue in LowerFormalArguments.
1697   unsigned OIdx = 0;
1698   // Declare the .params or .reg needed to pass values
1699   // to the function.
1700   for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1701     EVT VT = Outs[OIdx].VT;
1702     Type *Ty = Args[i].Ty;
1703     bool IsVAArg = (i >= CLI.NumFixedArgs);
1704     bool IsByVal = Outs[OIdx].Flags.isByVal();
1705 
1706     SmallVector<EVT, 16> VTs;
1707     SmallVector<uint64_t, 16> Offsets;
1708 
1709     assert((!IsByVal || Args[i].IndirectType) &&
1710            "byval arg must have indirect type");
1711     Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
1712     ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset);
1713 
1714     Align ArgAlign;
1715     if (IsByVal) {
1716       // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1717       // so we don't need to worry whether it's naturally aligned or not.
1718       // See TargetLowering::LowerCallTo().
1719       Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1720       ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
1721                                             InitialAlign, DL);
1722       if (IsVAArg)
1723         VAOffset = alignTo(VAOffset, ArgAlign);
1724     } else {
1725       ArgAlign = getArgumentAlignment(CB, Ty, ParamCount + 1, DL);
1726     }
1727 
1728     unsigned TypeSize =
1729         (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty));
1730     SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1731 
1732     bool NeedAlign; // Does argument declaration specify alignment?
1733     bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty);
1734     if (IsVAArg) {
1735       if (ParamCount == FirstVAArg) {
1736         SDValue DeclareParamOps[] = {
1737             Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32),
1738             DAG.getConstant(ParamCount, dl, MVT::i32),
1739             DAG.getConstant(1, dl, MVT::i32), InGlue};
1740         VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl,
1741                                              DeclareParamVTs, DeclareParamOps);
1742       }
1743       NeedAlign = PassAsArray;
1744     } else if (PassAsArray) {
1745       // declare .param .align <align> .b8 .param<n>[<size>];
1746       SDValue DeclareParamOps[] = {
1747           Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
1748           DAG.getConstant(ParamCount, dl, MVT::i32),
1749           DAG.getConstant(TypeSize, dl, MVT::i32), InGlue};
1750       Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1751                           DeclareParamOps);
1752       NeedAlign = true;
1753     } else {
1754       // declare .param .b<size> .param<n>;
1755       if (VT.isInteger() || VT.isFloatingPoint()) {
1756         // PTX ABI requires integral types to be at least 32 bits in
1757         // size. FP16 is loaded/stored using i16, so it's handled
1758         // here as well.
1759         TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8;
1760       }
1761       SDValue DeclareScalarParamOps[] = {
1762           Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
1763           DAG.getConstant(TypeSize * 8, dl, MVT::i32),
1764           DAG.getConstant(0, dl, MVT::i32), InGlue};
1765       Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1766                           DeclareScalarParamOps);
1767       NeedAlign = false;
1768     }
1769     InGlue = Chain.getValue(1);
1770 
1771     // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1772     // than 32-bits are sign extended or zero extended, depending on
1773     // whether they are signed or unsigned types. This case applies
1774     // only to scalar parameters and not to aggregate values.
1775     bool ExtendIntegerParam =
1776         Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1777 
1778     auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1779     SmallVector<SDValue, 6> StoreOperands;
1780     for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1781       EVT EltVT = VTs[j];
1782       int CurOffset = Offsets[j];
1783       MaybeAlign PartAlign;
1784       if (NeedAlign)
1785         PartAlign = commonAlignment(ArgAlign, CurOffset);
1786 
1787       SDValue StVal = OutVals[OIdx];
1788 
1789       MVT PromotedVT;
1790       if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
1791         EltVT = EVT(PromotedVT);
1792       }
1793       if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {
1794         llvm::ISD::NodeType Ext =
1795             Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1796         StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
1797       }
1798 
1799       if (IsByVal) {
1800         auto PtrVT = getPointerTy(DL);
1801         SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
1802                                       DAG.getConstant(CurOffset, dl, PtrVT));
1803         StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(),
1804                             PartAlign);
1805       } else if (ExtendIntegerParam) {
1806         assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1807         // zext/sext to i32
1808         StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1809                                                       : ISD::ZERO_EXTEND,
1810                             dl, MVT::i32, StVal);
1811       }
1812 
1813       if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {
1814         // Use 16-bit registers for small stores as it's the
1815         // smallest general purpose register size supported by NVPTX.
1816         StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1817       }
1818 
1819       // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
1820       // scalar store. In such cases, fall back to byte stores.
1821       if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&
1822           PartAlign.value() <
1823               DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) {
1824         assert(StoreOperands.empty() && "Unfinished preceding store.");
1825         Chain = LowerUnalignedStoreParam(
1826             DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT,
1827             StVal, InGlue, ParamCount, dl);
1828 
1829         // LowerUnalignedStoreParam took care of inserting the necessary nodes
1830         // into the SDAG, so just move on to the next element.
1831         if (!IsByVal)
1832           ++OIdx;
1833         continue;
1834       }
1835 
1836       // New store.
1837       if (VectorInfo[j] & PVF_FIRST) {
1838         assert(StoreOperands.empty() && "Unfinished preceding store.");
1839         StoreOperands.push_back(Chain);
1840         StoreOperands.push_back(
1841             DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
1842 
1843         StoreOperands.push_back(DAG.getConstant(
1844             IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
1845             dl, MVT::i32));
1846       }
1847 
1848       // Record the value to store.
1849       StoreOperands.push_back(StVal);
1850 
1851       if (VectorInfo[j] & PVF_LAST) {
1852         unsigned NumElts = StoreOperands.size() - 3;
1853         NVPTXISD::NodeType Op;
1854         switch (NumElts) {
1855         case 1:
1856           Op = NVPTXISD::StoreParam;
1857           break;
1858         case 2:
1859           Op = NVPTXISD::StoreParamV2;
1860           break;
1861         case 4:
1862           Op = NVPTXISD::StoreParamV4;
1863           break;
1864         default:
1865           llvm_unreachable("Invalid vector info.");
1866         }
1867 
1868         StoreOperands.push_back(InGlue);
1869 
1870         // Adjust type of the store op if we've extended the scalar
1871         // return value.
1872         EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1873 
1874         Chain = DAG.getMemIntrinsicNode(
1875             Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1876             TheStoreType, MachinePointerInfo(), PartAlign,
1877             MachineMemOperand::MOStore);
1878         InGlue = Chain.getValue(1);
1879 
1880         // Cleanup.
1881         StoreOperands.clear();
1882 
1883         // TODO: We may need to support vector types that can be passed
1884         // as scalars in variadic arguments.
1885         if (!IsByVal && IsVAArg) {
1886           assert(NumElts == 1 &&
1887                  "Vectorization is expected to be disabled for variadics.");
1888           VAOffset += DL.getTypeAllocSize(
1889               TheStoreType.getTypeForEVT(*DAG.getContext()));
1890         }
1891       }
1892       if (!IsByVal)
1893         ++OIdx;
1894     }
1895     assert(StoreOperands.empty() && "Unfinished parameter store.");
1896     if (!IsByVal && VTs.size() > 0)
1897       --OIdx;
1898     ++ParamCount;
1899     if (IsByVal && IsVAArg)
1900       VAOffset += TypeSize;
1901   }
1902 
1903   GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1904   MaybeAlign retAlignment = std::nullopt;
1905 
1906   // Handle Result
1907   if (Ins.size() > 0) {
1908     SmallVector<EVT, 16> resvtparts;
1909     ComputeValueVTs(*this, DL, RetTy, resvtparts);
1910 
1911     // Declare
1912     //  .param .align N .b8 retval0[<size-in-bytes>], or
1913     //  .param .b<size-in-bits> retval0
1914     unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1915     if (!IsTypePassedAsArray(RetTy)) {
1916       resultsz = promoteScalarArgumentSize(resultsz);
1917       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1918       SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1919                                   DAG.getConstant(resultsz, dl, MVT::i32),
1920                                   DAG.getConstant(0, dl, MVT::i32), InGlue };
1921       Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1922                           DeclareRetOps);
1923       InGlue = Chain.getValue(1);
1924     } else {
1925       retAlignment = getArgumentAlignment(CB, RetTy, 0, DL);
1926       assert(retAlignment && "retAlignment is guaranteed to be set");
1927       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1928       SDValue DeclareRetOps[] = {
1929           Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
1930           DAG.getConstant(resultsz / 8, dl, MVT::i32),
1931           DAG.getConstant(0, dl, MVT::i32), InGlue};
1932       Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1933                           DeclareRetOps);
1934       InGlue = Chain.getValue(1);
1935     }
1936   }
1937 
1938   bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1939   // Set the size of the vararg param byte array if the callee is a variadic
1940   // function and the variadic part is not empty.
1941   if (HasVAArgs) {
1942     SDValue DeclareParamOps[] = {
1943         VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),
1944         VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),
1945         VADeclareParam.getOperand(4)};
1946     DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1947                     VADeclareParam->getVTList(), DeclareParamOps);
1948   }
1949 
1950   // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1951   // between them we must rely on the call site value which is valid for
1952   // indirect calls but is always null for libcalls.
1953   bool isIndirectCall = !Func && CB;
1954 
1955   if (isa<ExternalSymbolSDNode>(Callee)) {
1956     Function* CalleeFunc = nullptr;
1957 
1958     // Try to find the callee in the current module.
1959     Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1960     assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1961 
1962     // Set the "libcall callee" attribute to indicate that the function
1963     // must always have a declaration.
1964     CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1965   }
1966 
1967   if (isIndirectCall) {
1968     // This is the indirect function call case: PTX requires a prototype of
1969     // the form
1970     // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1971     // to be emitted, and the label has to be used as the last arg of the
1972     // call instruction.
1973     // The prototype is embedded in a string and put as the operand for a
1974     // CallPrototype SDNode which will print out to the value of the string.
1975     SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1976     std::string Proto = getPrototype(
1977         DL, RetTy, Args, Outs, retAlignment,
1978         HasVAArgs
1979             ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
1980                   CLI.NumFixedArgs, VADeclareParam->getConstantOperandAPInt(1)))
1981             : std::nullopt,
1982         *CB, UniqueCallSite);
1983     const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
1984     SDValue ProtoOps[] = {
1985         Chain,
1986         DAG.getTargetExternalSymbol(ProtoStr, MVT::i32),
1987         InGlue,
1988     };
1989     Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1990     InGlue = Chain.getValue(1);
1991   }
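       // The sequence of nodes emitted below prints, roughly, a PTX call such as
       //   call.uni (retval0), callee, (param0, param1);
       // with the prototype label appended as the last argument for indirect calls
       // (illustrative; the exact form depends on results, arguments, convergence).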
1992   // Op to just print "call"
1993   SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1994   SDValue PrintCallOps[] = {
1995     Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
1996   };
1997   // We model convergent calls as separate opcodes.
1998   unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
1999   if (CLI.IsConvergent)
2000     Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
2001                                               : NVPTXISD::PrintConvergentCall;
2002   Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
2003   InGlue = Chain.getValue(1);
2004 
2005   // Ops to print out the function name
2006   SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2007   SDValue CallVoidOps[] = { Chain, Callee, InGlue };
2008   Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
2009   InGlue = Chain.getValue(1);
2010 
2011   // Ops to print out the param list
2012   SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2013   SDValue CallArgBeginOps[] = { Chain, InGlue };
2014   Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
2015                       CallArgBeginOps);
2016   InGlue = Chain.getValue(1);
2017 
2018   for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e;
2019        ++i) {
2020     unsigned opcode;
2021     if (i == (e - 1))
2022       opcode = NVPTXISD::LastCallArg;
2023     else
2024       opcode = NVPTXISD::CallArg;
2025     SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2026     SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
2027                              DAG.getConstant(i, dl, MVT::i32), InGlue };
2028     Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
2029     InGlue = Chain.getValue(1);
2030   }
2031   SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2032   SDValue CallArgEndOps[] = { Chain,
2033                               DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
2034                               InGlue };
2035   Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
2036   InGlue = Chain.getValue(1);
2037 
2038   if (isIndirectCall) {
2039     SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2040     SDValue PrototypeOps[] = {
2041         Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
2042     Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
2043     InGlue = Chain.getValue(1);
2044   }
2045 
2046   SmallVector<SDValue, 16> ProxyRegOps;
2047   SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
2048   // An element of this vector is set if the corresponding result does not
2049   // need a ProxyReg operation and should be added to InVals as is. ProxyRegOps
2050   // and ProxyRegTruncates contain empty/none entries at the same index.
2051   SmallVector<SDValue, 16> RetElts;
2052   // Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
2053   // to consume the values of the `LoadParam`s; they are replaced later, once
2054   // `CALLSEQ_END` has been added.
2055   SmallVector<SDValue, 16> TempProxyRegOps;
2056 
2057   // Generate loads from param memory/moves from registers for result
2058   if (Ins.size() > 0) {
2059     SmallVector<EVT, 16> VTs;
2060     SmallVector<uint64_t, 16> Offsets;
2061     ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
2062     assert(VTs.size() == Ins.size() && "Bad value decomposition");
2063 
2064     Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
2065     auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
2066 
2067     SmallVector<EVT, 6> LoadVTs;
2068     int VecIdx = -1; // Index of the first element of the vector.
2069 
2070     // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2071     // 32-bits are sign extended or zero extended, depending on whether
2072     // they are signed or unsigned types.
2073     bool ExtendIntegerRetVal =
2074         RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2075 
2076     for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2077       bool needTruncate = false;
2078       EVT TheLoadType = VTs[i];
2079       EVT EltType = Ins[i].VT;
2080       Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
2081       MVT PromotedVT;
2082 
2083       if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
2084         TheLoadType = EVT(PromotedVT);
2085         EltType = EVT(PromotedVT);
2086         needTruncate = true;
2087       }
2088 
2089       if (ExtendIntegerRetVal) {
2090         TheLoadType = MVT::i32;
2091         EltType = MVT::i32;
2092         needTruncate = true;
2093       } else if (TheLoadType.getSizeInBits() < 16) {
2094         if (VTs[i].isInteger())
2095           needTruncate = true;
2096         EltType = MVT::i16;
2097       }
2098 
2099       // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
2100       // scalar load. In such cases, fall back to byte loads.
2101       if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() &&
2102           EltAlign < DL.getABITypeAlign(
2103                          TheLoadType.getTypeForEVT(*DAG.getContext()))) {
2104         assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
2105         SDValue Ret = LowerUnalignedLoadRetParam(
2106             DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl);
2107         ProxyRegOps.push_back(SDValue());
2108         ProxyRegTruncates.push_back(std::optional<MVT>());
2109         RetElts.resize(i);
2110         RetElts.push_back(Ret);
2111 
2112         continue;
2113       }
2114 
2115       // Record index of the very first element of the vector.
2116       if (VectorInfo[i] & PVF_FIRST) {
2117         assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
2118         VecIdx = i;
2119       }
2120 
2121       LoadVTs.push_back(EltType);
2122 
2123       if (VectorInfo[i] & PVF_LAST) {
2124         unsigned NumElts = LoadVTs.size();
2125         LoadVTs.push_back(MVT::Other);
2126         LoadVTs.push_back(MVT::Glue);
2127         NVPTXISD::NodeType Op;
2128         switch (NumElts) {
2129         case 1:
2130           Op = NVPTXISD::LoadParam;
2131           break;
2132         case 2:
2133           Op = NVPTXISD::LoadParamV2;
2134           break;
2135         case 4:
2136           Op = NVPTXISD::LoadParamV4;
2137           break;
2138         default:
2139           llvm_unreachable("Invalid vector info.");
2140         }
2141 
2142         SDValue LoadOperands[] = {
2143             Chain, DAG.getConstant(1, dl, MVT::i32),
2144             DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue};
2145         SDValue RetVal = DAG.getMemIntrinsicNode(
2146             Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
2147             MachinePointerInfo(), EltAlign,
2148             MachineMemOperand::MOLoad);
2149 
2150         for (unsigned j = 0; j < NumElts; ++j) {
2151           ProxyRegOps.push_back(RetVal.getValue(j));
2152 
2153           if (needTruncate)
2154             ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT));
2155           else
2156             ProxyRegTruncates.push_back(std::optional<MVT>());
2157         }
2158 
2159         Chain = RetVal.getValue(NumElts);
2160         InGlue = RetVal.getValue(NumElts + 1);
2161 
2162         // Cleanup
2163         VecIdx = -1;
2164         LoadVTs.clear();
2165       }
2166     }
2167   }
2168 
2169   Chain =
2170       DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
2171   InGlue = Chain.getValue(1);
2172 
2173   // Append ProxyReg instructions to the chain to make sure that `callseq_end`
2174   // will not get lost. Otherwise, during libcall expansion, the nodes can become
2175   // dangling.
2176   for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
2177     if (i < RetElts.size() && RetElts[i]) {
2178       InVals.push_back(RetElts[i]);
2179       continue;
2180     }
2181 
2182     SDValue Ret = DAG.getNode(
2183       NVPTXISD::ProxyReg, dl,
2184       DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
2185       { Chain, ProxyRegOps[i], InGlue }
2186     );
2187 
2188     Chain = Ret.getValue(1);
2189     InGlue = Ret.getValue(2);
2190 
2191     if (ProxyRegTruncates[i]) {
2192       Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret);
2193     }
2194 
2195     InVals.push_back(Ret);
2196   }
2197 
2198   for (SDValue &T : TempProxyRegOps) {
2199     SDValue Repl = DAG.getNode(
2200         NVPTXISD::ProxyReg, dl,
2201         DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue),
2202         {Chain, T.getOperand(0), InGlue});
2203     DAG.ReplaceAllUsesWith(T, Repl);
2204     DAG.RemoveDeadNode(T.getNode());
2205 
2206     Chain = Repl.getValue(1);
2207     InGlue = Repl.getValue(2);
2208   }
2209 
2210   // set isTailCall to false for now, until we figure out how to express
2211   // tail call optimization in PTX
2212   isTailCall = false;
2213   return Chain;
2214 }
2215 
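     // Lower dynamic stack allocation to the PTX 'alloca' instruction (available
     // with PTX ISA 7.3+ on sm_52+; otherwise a diagnostic is emitted below). The
     // size operand is widened or truncated to the machine word width and the
     // requested alignment is passed through as an immediate operand.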
2216 SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
2217                                                      SelectionDAG &DAG) const {
2218 
2219   if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
2220     const Function &Fn = DAG.getMachineFunction().getFunction();
2221 
2222     DiagnosticInfoUnsupported NoDynamicAlloca(
2223         Fn,
2224         "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
2225         "requires target sm_52.",
2226         SDLoc(Op).getDebugLoc());
2227     DAG.getContext()->diagnose(NoDynamicAlloca);
2228     auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
2229                 Op.getOperand(0)};
2230     return DAG.getMergeValues(Ops, SDLoc());
2231   }
2232 
2233   SDValue Chain = Op.getOperand(0);
2234   SDValue Size = Op.getOperand(1);
2235   uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2236   SDLoc DL(Op.getNode());
2237 
2238   // The size of the PTX alloca instruction is 64-bit for m64, 32-bit for m32.
2239   if (nvTM->is64Bit())
2240     Size = DAG.getZExtOrTrunc(Size, DL, MVT::i64);
2241   else
2242     Size = DAG.getZExtOrTrunc(Size, DL, MVT::i32);
2243 
2244   SDValue AllocOps[] = {Chain, Size,
2245                         DAG.getTargetConstant(Align, DL, MVT::i32)};
2246   SDValue Alloca = DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL,
2247                                nvTM->is64Bit() ? MVT::i64 : MVT::i32, AllocOps);
2248 
2249   SDValue MergeOps[] = {Alloca, Chain};
2250   return DAG.getMergeValues(MergeOps, DL);
2251 }
2252 
2253 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
2254 // (see LegalizeDAG.cpp). This is slow and uses local memory.
2255 // We use extract/insert/build vector just as LegalizeOp() does in llvm 2.5.
2256 SDValue
2257 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
2258   SDNode *Node = Op.getNode();
2259   SDLoc dl(Node);
2260   SmallVector<SDValue, 8> Ops;
2261   unsigned NumOperands = Node->getNumOperands();
2262   for (unsigned i = 0; i < NumOperands; ++i) {
2263     SDValue SubOp = Node->getOperand(i);
2264     EVT VVT = SubOp.getNode()->getValueType(0);
2265     EVT EltVT = VVT.getVectorElementType();
2266     unsigned NumSubElem = VVT.getVectorNumElements();
2267     for (unsigned j = 0; j < NumSubElem; ++j) {
2268       Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
2269                                 DAG.getIntPtrConstant(j, dl)));
2270     }
2271   }
2272   return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
2273 }
2274 
2275 // We can init constant f16x2/v2i16/v4i8 with a single .b32 move.  Normally it
2276 // would get lowered as two constant loads and vector-packing move.
2277 // Instead we want just a constant move:
2278 //        mov.b32         %r2, 0x40003C00
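     // (e.g. <half 1.0, half 2.0> packs as 0x3C00 | (0x4000 << 16) = 0x40003C00).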
2279 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2280                                                SelectionDAG &DAG) const {
2281   EVT VT = Op->getValueType(0);
2282   if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
2283     return Op;
2284 
2285   SDLoc DL(Op);
2286 
2287   if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2288         return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2289                isa<ConstantFPSDNode>(Operand);
2290       })) {
2291     // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
2292     // to optimize calculation of constant parts.
2293     if (VT == MVT::v4i8) {
2294       SDValue C8 = DAG.getConstant(8, DL, MVT::i32);
2295       SDValue E01 = DAG.getNode(
2296           NVPTXISD::BFI, DL, MVT::i32,
2297           DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
2298           DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
2299       SDValue E012 =
2300           DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2301                       DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
2302                       E01, DAG.getConstant(16, DL, MVT::i32), C8);
2303       SDValue E0123 =
2304           DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2305                       DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
2306                       E012, DAG.getConstant(24, DL, MVT::i32), C8);
2307       return DAG.getNode(ISD::BITCAST, DL, VT, E0123);
2308     }
2309     return Op;
2310   }
2311 
2312   // Get the value of the Nth operand as an APInt(32). Undef values are 0.
2313   auto GetOperand = [](SDValue Op, int N) -> APInt {
2314     const SDValue &Operand = Op->getOperand(N);
2315     EVT VT = Op->getValueType(0);
2316     if (Operand->isUndef())
2317       return APInt(32, 0);
2318     APInt Value;
2319     if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2320       Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2321     else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2322       Value = Operand->getAsAPIntVal();
2323     else
2324       llvm_unreachable("Unsupported type");
2325     // i8 values are carried around as i16, so we need to zero out upper bits,
2326     // so they do not get in the way of combining individual byte values
2327     if (VT == MVT::v4i8)
2328       Value = Value.trunc(8);
2329     return Value.zext(32);
2330   };
2331   APInt Value;
2332   if (Isv2x16VT(VT)) {
2333     Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16);
2334   } else if (VT == MVT::v4i8) {
2335     Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) |
2336             GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24);
2337   } else {
2338     llvm_unreachable("Unsupported type");
2339   }
2340   SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32);
2341   return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const);
2342 }
2343 
2344 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2345                                                      SelectionDAG &DAG) const {
2346   SDValue Index = Op->getOperand(1);
2347   SDValue Vector = Op->getOperand(0);
2348   SDLoc DL(Op);
2349   EVT VectorVT = Vector.getValueType();
2350 
2351   if (VectorVT == MVT::v4i8) {
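         // Extract byte #Index with a bit-field extract: start bit = Index * 8,
         // width = 8 bits; then truncate/extend to the requested result type.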
2352     SDValue BFE =
2353         DAG.getNode(NVPTXISD::BFE, DL, MVT::i32,
2354                     {Vector,
2355                      DAG.getNode(ISD::MUL, DL, MVT::i32,
2356                                  DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2357                                  DAG.getConstant(8, DL, MVT::i32)),
2358                      DAG.getConstant(8, DL, MVT::i32)});
2359     return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0));
2360   }
2361 
2362   // Constant index will be matched by tablegen.
2363   if (isa<ConstantSDNode>(Index.getNode()))
2364     return Op;
2365 
2366   // Extract individual elements and select one of them.
2367   assert(Isv2x16VT(VectorVT) && "Unexpected vector type.");
2368   EVT EltVT = VectorVT.getVectorElementType();
2369 
2370   SDLoc dl(Op.getNode());
2371   SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2372                            DAG.getIntPtrConstant(0, dl));
2373   SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2374                            DAG.getIntPtrConstant(1, dl));
2375   return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2376                          ISD::CondCode::SETEQ);
2377 }
2378 
2379 SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2380                                                     SelectionDAG &DAG) const {
2381   SDValue Vector = Op->getOperand(0);
2382   EVT VectorVT = Vector.getValueType();
2383 
2384   if (VectorVT != MVT::v4i8)
2385     return Op;
2386   SDLoc DL(Op);
2387   SDValue Value = Op->getOperand(1);
2388   if (Value->isUndef())
2389     return Vector;
2390 
2391   SDValue Index = Op->getOperand(2);
2392 
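       // Insert the new byte with a bit-field insert: place the low 8 bits of
       // Value into Vector at bit position Index * 8.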
2393   SDValue BFI =
2394       DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2395                   {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2396                    DAG.getNode(ISD::MUL, DL, MVT::i32,
2397                                DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2398                                DAG.getConstant(8, DL, MVT::i32)),
2399                    DAG.getConstant(8, DL, MVT::i32)});
2400   return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2401 }
2402 
2403 SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2404                                                  SelectionDAG &DAG) const {
2405   SDValue V1 = Op.getOperand(0);
2406   EVT VectorVT = V1.getValueType();
2407   if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2408     return Op;
2409 
2410   // Lower shuffle to PRMT instruction.
2411   const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2412   SDValue V2 = Op.getOperand(1);
2413   uint32_t Selector = 0;
2414   for (auto I : llvm::enumerate(SVN->getMask())) {
2415     if (I.value() != -1) // -1 is a placeholder for undef.
2416       Selector |= (I.value() << (I.index() * 4));
2417   }
2418 
2419   SDLoc DL(Op);
2420   return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2,
2421                      DAG.getConstant(Selector, DL, MVT::i32),
2422                      DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32));
2423 }
2424 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
2425 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2426 ///    amount, or
2427 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2428 ///    amount.
2429 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2430                                                   SelectionDAG &DAG) const {
2431   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2432   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2433 
2434   EVT VT = Op.getValueType();
2435   unsigned VTBits = VT.getSizeInBits();
2436   SDLoc dl(Op);
2437   SDValue ShOpLo = Op.getOperand(0);
2438   SDValue ShOpHi = Op.getOperand(1);
2439   SDValue ShAmt  = Op.getOperand(2);
2440   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2441 
2442   if (VTBits == 32 && STI.getSmVersion() >= 35) {
2443     // For 32-bit and sm_35+, we can use the funnel shift 'shf' instruction.
2444     // {dHi, dLo} = {aHi, aLo} >> Amt
2445     //   dHi = aHi >> Amt
2446     //   dLo = shf.r.clamp aLo, aHi, Amt
2447 
2448     SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2449     SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
2450                              ShAmt);
2451 
2452     SDValue Ops[2] = { Lo, Hi };
2453     return DAG.getMergeValues(Ops, dl);
2454   }
2455   else {
2456     // {dHi, dLo} = {aHi, aLo} >> Amt
2457     // - if (Amt>=size) then
2458     //      dLo = aHi >> (Amt-size)
2459     //      dHi = aHi >> Amt (this is either all 0 or all 1)
2460     //   else
2461     //      dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2462     //      dHi = aHi >> Amt
2463 
2464     SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2465                                    DAG.getConstant(VTBits, dl, MVT::i32),
2466                                    ShAmt);
2467     SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2468     SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2469                                      DAG.getConstant(VTBits, dl, MVT::i32));
2470     SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2471     SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2472     SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2473 
2474     SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2475                                DAG.getConstant(VTBits, dl, MVT::i32),
2476                                ISD::SETGE);
2477     SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2478     SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2479 
2480     SDValue Ops[2] = { Lo, Hi };
2481     return DAG.getMergeValues(Ops, dl);
2482   }
2483 }
2484 
2485 /// LowerShiftLeftParts - Lower SHL_PARTS, which
2486 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2487 ///    amount, or
2488 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2489 ///    amount.
2490 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2491                                                  SelectionDAG &DAG) const {
2492   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2493   assert(Op.getOpcode() == ISD::SHL_PARTS);
2494 
2495   EVT VT = Op.getValueType();
2496   unsigned VTBits = VT.getSizeInBits();
2497   SDLoc dl(Op);
2498   SDValue ShOpLo = Op.getOperand(0);
2499   SDValue ShOpHi = Op.getOperand(1);
2500   SDValue ShAmt  = Op.getOperand(2);
2501 
2502   if (VTBits == 32 && STI.getSmVersion() >= 35) {
2503     // For 32-bit and sm_35+, we can use the funnel shift 'shf' instruction.
2504     // {dHi, dLo} = {aHi, aLo} << Amt
2505     //   dHi = shf.l.clamp aLo, aHi, Amt
2506     //   dLo = aLo << Amt
2507 
2508     SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2509                              ShAmt);
2510     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2511 
2512     SDValue Ops[2] = { Lo, Hi };
2513     return DAG.getMergeValues(Ops, dl);
2514   }
2515   else {
2516     // {dHi, dLo} = {aHi, aLo} << Amt
2517     // - if (Amt>=size) then
2518     //      dLo = aLo << Amt (all 0)
2519     //      dHi = aLo << (Amt-size)
2520     //   else
2521     //      dLo = aLo << Amt
2522     //      dHi = (aHi << Amt) | (aLo >> (size-Amt))
2523 
2524     SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2525                                    DAG.getConstant(VTBits, dl, MVT::i32),
2526                                    ShAmt);
2527     SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2528     SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2529                                      DAG.getConstant(VTBits, dl, MVT::i32));
2530     SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2531     SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2532     SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2533 
2534     SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2535                                DAG.getConstant(VTBits, dl, MVT::i32),
2536                                ISD::SETGE);
2537     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2538     SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2539 
2540     SDValue Ops[2] = { Lo, Hi };
2541     return DAG.getMergeValues(Ops, dl);
2542   }
2543 }
2544 
2545 SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2546   EVT VT = Op.getValueType();
2547 
2548   if (VT == MVT::f32)
2549     return LowerFROUND32(Op, DAG);
2550 
2551   if (VT == MVT::f64)
2552     return LowerFROUND64(Op, DAG);
2553 
2554   llvm_unreachable("unhandled type");
2555 }
2556 
2557 // This is the rounding method used in CUDA libdevice, in C-like code:
2558 // float roundf(float A)
2559 // {
2560 //   float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2561 //   RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2562 //   return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2563 // }
2564 SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2565                                            SelectionDAG &DAG) const {
2566   SDLoc SL(Op);
2567   SDValue A = Op.getOperand(0);
2568   EVT VT = Op.getValueType();
2569 
2570   SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2571 
2572   // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2573   SDValue Bitcast  = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2574   const int SignBitMask = 0x80000000;
2575   SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2576                              DAG.getConstant(SignBitMask, SL, MVT::i32));
2577   const int PointFiveInBits = 0x3F000000;
2578   SDValue PointFiveWithSignRaw =
2579       DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2580                   DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2581   SDValue PointFiveWithSign =
2582       DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2583   SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2584   SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2585 
2586   // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2587   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2588   SDValue IsLarge =
2589       DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2590                    ISD::SETOGT);
2591   RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2592 
2593   // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2594   SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2595                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2596   SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2597   return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2598 }
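
// Worked bit-level example of the sign-injection trick above (illustrative):
//   A = -2.3f  =>  (bits(A) & 0x80000000) == 0x80000000  (sign bit set)
//   0x80000000 | 0x3F000000 == 0xBF000000, i.e. -0.5f
//   trunc(-2.3f + -0.5f) == trunc(-2.8f) == -2.0f == roundf(-2.3f)
// OR-ing A's sign into the bit pattern of 0.5f produces +/-0.5 with A's sign,
// so a single FADD + FTRUNC implements "A > 0 ? A + 0.5f : A - 0.5f".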
2599 
2600 // The implementation of round(double) is similar to that of round(float) in
2601 // that they both separate the value range into three regions and use a method
2602 // specific to the region to round the values. However, round(double) first
2603 // calculates the round of the absolute value and then adds the sign back while
2604 // round(float) directly rounds the value with sign.
2605 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2606                                            SelectionDAG &DAG) const {
2607   SDLoc SL(Op);
2608   SDValue A = Op.getOperand(0);
2609   EVT VT = Op.getValueType();
2610 
2611   SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2612 
2613   // double RoundedA = (double) (int) (abs(A) + 0.5);
2614   SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2615                                   DAG.getConstantFP(0.5, SL, VT));
2616   SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2617 
2618   // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2619   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2620   SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2621                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2622   RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2623                          DAG.getConstantFP(0, SL, VT),
2624                          RoundedA);
2625 
2626   // Add the sign of A back to RoundedA.
2627   RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2629 
2630   // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2631   SDValue IsLarge =
2632       DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2633                    ISD::SETOGT);
2634   return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2635 }
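
// C-like sketch mirroring the DAG built above (illustrative only):
//   double round_sketch(double A) {
//     double RoundedA = trunc(fabs(A) + 0.5);
//     RoundedA = fabs(A) < 0.5 ? 0.0 : RoundedA;
//     RoundedA = copysign(RoundedA, A);          // re-attach A's sign
//     return fabs(A) > 0x1.0p52 ? A : RoundedA;  // >= 2^52 is already integral
//   }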
2636 
2637 SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2638                                             SelectionDAG &DAG) const {
2639   assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2640 
2641   if (Op.getValueType() == MVT::bf16) {
2642     SDLoc Loc(Op);
2643     return DAG.getNode(
2644         ISD::FP_ROUND, Loc, MVT::bf16,
2645         DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2646         DAG.getIntPtrConstant(0, Loc));
2647   }
2648 
2649   // Everything else is considered legal.
2650   return Op;
2651 }
2652 
2653 SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2654                                             SelectionDAG &DAG) const {
2655   assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2656 
2657   if (Op.getOperand(0).getValueType() == MVT::bf16) {
2658     SDLoc Loc(Op);
2659     return DAG.getNode(
2660         Op.getOpcode(), Loc, Op.getValueType(),
2661         DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2662   }
2663 
2664   // Everything else is considered legal.
2665   return Op;
2666 }
2667 
2668 SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2669                                            SelectionDAG &DAG) const {
2670   EVT NarrowVT = Op.getValueType();
2671   SDValue Wide = Op.getOperand(0);
2672   EVT WideVT = Wide.getValueType();
2673   if (NarrowVT.getScalarType() == MVT::bf16) {
2674     const TargetLowering *TLI = STI.getTargetLowering();
2675     if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2676       return TLI->expandFP_ROUND(Op.getNode(), DAG);
2677     }
2678     if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2679       // This combination was the first to support f32 -> bf16.
2680       if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2681         if (WideVT.getScalarType() == MVT::f32) {
2682           return Op;
2683         }
2684         if (WideVT.getScalarType() == MVT::f64) {
2685           SDLoc Loc(Op);
2686           // Round-inexact-to-odd f64 to f32, then do the final rounding using
2687           // the hardware f32 -> bf16 instruction.
2688           SDValue rod = TLI->expandRoundInexactToOdd(
2689               WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32)
2690                                 : MVT::f32,
2691               Wide, Loc, DAG);
2692           return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2693         }
2694       }
2695       return TLI->expandFP_ROUND(Op.getNode(), DAG);
2696     }
2697   }
2698 
2699   // Everything else is considered legal.
2700   return Op;
2701 }
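
// Note on the f64 -> bf16 path above: doing f64 -> f32 -> bf16 with two
// round-to-nearest steps can double-round (the first rounding may land
// exactly on a bf16 tie point and the second then breaks the tie the wrong
// way). Rounding the f64 -> f32 step to-odd preserves the sticky information,
// so the final hardware f32 -> bf16 round-to-nearest yields the correctly
// rounded result.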
2702 
2703 SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2704                                             SelectionDAG &DAG) const {
2705   SDValue Narrow = Op.getOperand(0);
2706   EVT NarrowVT = Narrow.getValueType();
2707   EVT WideVT = Op.getValueType();
2708   if (NarrowVT.getScalarType() == MVT::bf16) {
2709     if (WideVT.getScalarType() == MVT::f32 &&
2710         (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2711       SDLoc Loc(Op);
2712       return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2713     }
2714     if (WideVT.getScalarType() == MVT::f64 &&
2715         (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2716       EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32)
2717                                     : MVT::f32;
2718       SDLoc Loc(Op);
2719       if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2720         Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);
2721       } else {
2722         Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);
2723       }
2724       return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);
2725     }
2726   }
2727 
2728   // Everything else is considered legal.
2729   return Op;
2730 }
2731 
2732 static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
2733   SDLoc DL(Op);
2734   if (Op.getValueType() != MVT::v2i16)
2735     return Op;
2736   EVT EltVT = Op.getValueType().getVectorElementType();
2737   SmallVector<SDValue> VecElements;
2738   for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2739     SmallVector<SDValue> ScalarArgs;
2740     llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2741                     [&](const SDUse &O) {
2742                       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2743                                          O.get(), DAG.getIntPtrConstant(I, DL));
2744                     });
2745     VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2746   }
2747   SDValue V =
2748       DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2749   return V;
2750 }
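
// Illustration of the scalarization performed above for a v2i16 op:
//   r = add <2 x i16> a, b
// is rebuilt as
//   r = BUILD_VECTOR(add (extractelt a, 0), (extractelt b, 0),
//                    add (extractelt a, 1), (extractelt b, 1))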
2751 
2752 SDValue
2753 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2754   switch (Op.getOpcode()) {
2755   case ISD::RETURNADDR:
2756     return SDValue();
2757   case ISD::FRAMEADDR:
2758     return SDValue();
2759   case ISD::GlobalAddress:
2760     return LowerGlobalAddress(Op, DAG);
2761   case ISD::INTRINSIC_W_CHAIN:
2762     return Op;
2763   case ISD::BUILD_VECTOR:
2764     return LowerBUILD_VECTOR(Op, DAG);
2765   case ISD::EXTRACT_SUBVECTOR:
2766     return Op;
2767   case ISD::EXTRACT_VECTOR_ELT:
2768     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2769   case ISD::INSERT_VECTOR_ELT:
2770     return LowerINSERT_VECTOR_ELT(Op, DAG);
2771   case ISD::VECTOR_SHUFFLE:
2772     return LowerVECTOR_SHUFFLE(Op, DAG);
2773   case ISD::CONCAT_VECTORS:
2774     return LowerCONCAT_VECTORS(Op, DAG);
2775   case ISD::STORE:
2776     return LowerSTORE(Op, DAG);
2777   case ISD::LOAD:
2778     return LowerLOAD(Op, DAG);
2779   case ISD::SHL_PARTS:
2780     return LowerShiftLeftParts(Op, DAG);
2781   case ISD::SRA_PARTS:
2782   case ISD::SRL_PARTS:
2783     return LowerShiftRightParts(Op, DAG);
2784   case ISD::SELECT:
2785     return LowerSelect(Op, DAG);
2786   case ISD::FROUND:
2787     return LowerFROUND(Op, DAG);
2788   case ISD::SINT_TO_FP:
2789   case ISD::UINT_TO_FP:
2790     return LowerINT_TO_FP(Op, DAG);
2791   case ISD::FP_TO_SINT:
2792   case ISD::FP_TO_UINT:
2793     return LowerFP_TO_INT(Op, DAG);
2794   case ISD::FP_ROUND:
2795     return LowerFP_ROUND(Op, DAG);
2796   case ISD::FP_EXTEND:
2797     return LowerFP_EXTEND(Op, DAG);
2798   case ISD::VAARG:
2799     return LowerVAARG(Op, DAG);
2800   case ISD::VASTART:
2801     return LowerVASTART(Op, DAG);
2802   case ISD::ABS:
2803   case ISD::SMIN:
2804   case ISD::SMAX:
2805   case ISD::UMIN:
2806   case ISD::UMAX:
2807   case ISD::ADD:
2808   case ISD::SUB:
2809   case ISD::MUL:
2810   case ISD::SHL:
2811   case ISD::SREM:
2812   case ISD::UREM:
2813     return LowerVectorArith(Op, DAG);
2814   case ISD::DYNAMIC_STACKALLOC:
2815     return LowerDYNAMIC_STACKALLOC(Op, DAG);
2816   case ISD::CopyToReg:
2817     return LowerCopyToReg_128(Op, DAG);
2818   default:
2819     llvm_unreachable("Custom lowering not defined for operation");
2820   }
2821 }
2822 
2823 // This function is almost a copy of SelectionDAG::expandVAArg().
2824 // The only difference is that this one produces loads from the local address space.
2825 SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
2826   const TargetLowering *TLI = STI.getTargetLowering();
2827   SDLoc DL(Op);
2828 
2829   SDNode *Node = Op.getNode();
2830   const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
2831   EVT VT = Node->getValueType(0);
2832   auto *Ty = VT.getTypeForEVT(*DAG.getContext());
2833   SDValue Tmp1 = Node->getOperand(0);
2834   SDValue Tmp2 = Node->getOperand(1);
2835   const MaybeAlign MA(Node->getConstantOperandVal(3));
2836 
2837   SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
2838                                    Tmp1, Tmp2, MachinePointerInfo(V));
2839   SDValue VAList = VAListLoad;
2840 
2841   if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
2842     VAList = DAG.getNode(
2843         ISD::ADD, DL, VAList.getValueType(), VAList,
2844         DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
2845 
2846     VAList = DAG.getNode(
2847         ISD::AND, DL, VAList.getValueType(), VAList,
2848         DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType()));
2849   }
2850 
2851   // Increment the pointer, VAList, to the next vaarg
2852   Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
2853                      DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty),
2854                                      DL, VAList.getValueType()));
2855 
2856   // Store the incremented VAList to the legalized pointer
2857   Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
2858                       MachinePointerInfo(V));
2859 
2860   const Value *SrcV =
2861       Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL));
2862 
2863   // Load the actual argument out of the pointer VAList
2864   return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
2865 }
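
// C-like sketch of the va_arg expansion above (illustrative; 'align' is the
// value of operand 3 and the final argument load is from the NVPTX local
// address space):
//   char *p = *ap;                                  // current vararg pointer
//   if (align > min_stack_arg_align)                // getMinStackArgumentAlignment()
//     p = (char *)(((uintptr_t)p + align - 1) & -(uintptr_t)align);
//   *ap = p + sizeof(T);                            // bump past this argument
//   return *(T *)p;                                 // load the argument itself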
2866 
2867 SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
2868   const TargetLowering *TLI = STI.getTargetLowering();
2869   SDLoc DL(Op);
2870   EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
2871 
2872   // Store the address of unsized array <function>_vararg[] in the ap object.
2873   SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
2874   SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg);
2875 
2876   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2877   return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
2878                       MachinePointerInfo(SV));
2879 }
2880 
2881 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2882   SDValue Op0 = Op->getOperand(0);
2883   SDValue Op1 = Op->getOperand(1);
2884   SDValue Op2 = Op->getOperand(2);
2885   SDLoc DL(Op.getNode());
2886 
2887   assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2888 
2889   Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2890   Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2891   SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2892   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2893 
2894   return Trunc;
2895 }
2896 
2897 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2898   if (Op.getValueType() == MVT::i1)
2899     return LowerLOADi1(Op, DAG);
2900 
2901   // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on the legalizer to
2902   // handle unaligned loads and have to handle them here.
2903   EVT VT = Op.getValueType();
2904   if (Isv2x16VT(VT) || VT == MVT::v4i8) {
2905     LoadSDNode *Load = cast<LoadSDNode>(Op);
2906     EVT MemVT = Load->getMemoryVT();
2907     if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2908                                         MemVT, *Load->getMemOperand())) {
2909       SDValue Ops[2];
2910       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2911       return DAG.getMergeValues(Ops, SDLoc(Op));
2912     }
2913   }
2914 
2915   return SDValue();
2916 }
2917 
2918 // v = ld i1* addr
2919 //   =>
2920 // v1 = ld i8* addr (-> i16)
2921 // v = trunc i16 to i1
2922 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2923   SDNode *Node = Op.getNode();
2924   LoadSDNode *LD = cast<LoadSDNode>(Node);
2925   SDLoc dl(Node);
2926   assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2927   assert(Node->getValueType(0) == MVT::i1 &&
2928          "Custom lowering for i1 load only");
2929   SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
2930                                  LD->getBasePtr(), LD->getPointerInfo(),
2931                                  MVT::i8, LD->getAlign(),
2932                                  LD->getMemOperand()->getFlags());
2933   SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2934   // The legalizer (the caller) is expecting two values from the legalized
2935   // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2936   // in LegalizeDAG.cpp which also uses MergeValues.
2937   SDValue Ops[] = { result, LD->getChain() };
2938   return DAG.getMergeValues(Ops, dl);
2939 }
2940 
2941 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2942   StoreSDNode *Store = cast<StoreSDNode>(Op);
2943   EVT VT = Store->getMemoryVT();
2944 
2945   if (VT == MVT::i1)
2946     return LowerSTOREi1(Op, DAG);
2947 
2948   // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on the legalizer to
2949   // handle unaligned stores and have to handle them here.
2950   if ((Isv2x16VT(VT) || VT == MVT::v4i8) &&
2951       !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2952                                       VT, *Store->getMemOperand()))
2953     return expandUnalignedStore(Store, DAG);
2954 
2955   // Otherwise, v2f16, v2bf16, v2i16 and v4i8 don't need any special handling.
2956   if (Isv2x16VT(VT) || VT == MVT::v4i8)
2957     return SDValue();
2958 
2959   if (VT.isVector())
2960     return LowerSTOREVector(Op, DAG);
2961 
2962   return SDValue();
2963 }
2964 
2965 SDValue
2966 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2967   SDNode *N = Op.getNode();
2968   SDValue Val = N->getOperand(1);
2969   SDLoc DL(N);
2970   EVT ValVT = Val.getValueType();
2971 
2972   if (ValVT.isVector()) {
2973     // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2974     // legal.  We can (and should) split that into 2 stores of <2 x double> here
2975     // but I'm leaving that as a TODO for now.
2976     if (!ValVT.isSimple())
2977       return SDValue();
2978     switch (ValVT.getSimpleVT().SimpleTy) {
2979     default:
2980       return SDValue();
2981     case MVT::v2i8:
2982     case MVT::v2i16:
2983     case MVT::v2i32:
2984     case MVT::v2i64:
2985     case MVT::v2f16:
2986     case MVT::v2bf16:
2987     case MVT::v2f32:
2988     case MVT::v2f64:
2989     case MVT::v4i8:
2990     case MVT::v4i16:
2991     case MVT::v4i32:
2992     case MVT::v4f16:
2993     case MVT::v4bf16:
2994     case MVT::v4f32:
2995     case MVT::v8f16: // <4 x f16x2>
2996     case MVT::v8bf16: // <4 x bf16x2>
2997     case MVT::v8i16:  // <4 x i16x2>
2998       // This is a "native" vector type
2999       break;
3000     }
3001 
3002     MemSDNode *MemSD = cast<MemSDNode>(N);
3003     const DataLayout &TD = DAG.getDataLayout();
3004 
3005     Align Alignment = MemSD->getAlign();
3006     Align PrefAlign =
3007         TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
3008     if (Alignment < PrefAlign) {
3009       // This store is not sufficiently aligned, so bail out and let this vector
3010       // store be scalarized.  Note that we may still be able to emit smaller
3011       // vector stores.  For example, if we are storing a <4 x float> with an
3012       // alignment of 8, this check will fail but the legalizer will try again
3013       // with 2 x <2 x float>, which will succeed with an alignment of 8.
3014       return SDValue();
3015     }
3016 
3017     unsigned Opcode = 0;
3018     EVT EltVT = ValVT.getVectorElementType();
3019     unsigned NumElts = ValVT.getVectorNumElements();
3020 
3021     // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
3022     // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
3023     // stored type to i16 and propagate the "real" type as the memory type.
3024     bool NeedExt = false;
3025     if (EltVT.getSizeInBits() < 16)
3026       NeedExt = true;
3027 
3028     bool StoreF16x2 = false;
3029     switch (NumElts) {
3030     default:
3031       return SDValue();
3032     case 2:
3033       Opcode = NVPTXISD::StoreV2;
3034       break;
3035     case 4:
3036       Opcode = NVPTXISD::StoreV4;
3037       break;
3038     case 8:
3039       // v8f16, v8bf16 and v8i16 are special cases. PTX doesn't have a st.v8
3040       // instruction for 16-bit elements. Instead, we split the vector into
3041       // two-element chunks (f16x2/bf16x2/i16x2) and store them with st.v4.b32.
3042       assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector.");
3043       Opcode = NVPTXISD::StoreV4;
3044       StoreF16x2 = true;
3045       break;
3046     }
3047 
3048     SmallVector<SDValue, 8> Ops;
3049 
3050     // First is the chain
3051     Ops.push_back(N->getOperand(0));
3052 
3053     if (StoreF16x2) {
3054       // Combine f16,f16 -> v2f16
3055       NumElts /= 2;
3056       for (unsigned i = 0; i < NumElts; ++i) {
3057         SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
3058                                  DAG.getIntPtrConstant(i * 2, DL));
3059         SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
3060                                  DAG.getIntPtrConstant(i * 2 + 1, DL));
3061         EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2);
3062         SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1);
3063         Ops.push_back(V2);
3064       }
3065     } else {
3066       // Then the split values
3067       for (unsigned i = 0; i < NumElts; ++i) {
3068         SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
3069                                      DAG.getIntPtrConstant(i, DL));
3070         if (NeedExt)
3071           ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
3072         Ops.push_back(ExtVal);
3073       }
3074     }
3075 
3076     // Then any remaining arguments
3077     Ops.append(N->op_begin() + 2, N->op_end());
3078 
3079     SDValue NewSt =
3080         DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3081                                 MemSD->getMemoryVT(), MemSD->getMemOperand());
3082 
3084     return NewSt;
3085   }
3086 
3087   return SDValue();
3088 }
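
// Illustration of the StoreF16x2 path above: a <8 x half> store is emitted as
// a StoreV4 whose four operands are two-element BUILD_VECTORs
//   {a0,a1}, {a2,a3}, {a4,a5}, {a6,a7}
// each of which is selected into a single 32-bit register, yielding one
// st.v4.b32 instruction.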
3089 
3090 // st i1 v, addr
3091 //    =>
3092 // v1 = zxt v to i16
3093 // st.u8 i16, addr
3094 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3095   SDNode *Node = Op.getNode();
3096   SDLoc dl(Node);
3097   StoreSDNode *ST = cast<StoreSDNode>(Node);
3098   SDValue Tmp1 = ST->getChain();
3099   SDValue Tmp2 = ST->getBasePtr();
3100   SDValue Tmp3 = ST->getValue();
3101   assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3102   Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3103   SDValue Result =
3104       DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3105                         ST->getAlign(), ST->getMemOperand()->getFlags());
3106   return Result;
3107 }
3108 
3109 SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
3110                                                 SelectionDAG &DAG) const {
3111   // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
3112   // operand so that it can pass legalization.
3113 
3114   assert(Op.getOperand(1).getValueType() == MVT::i128 &&
3115          "Custom lowering for 128-bit CopyToReg only");
3116 
3117   SDNode *Node = Op.getNode();
3118   SDLoc DL(Node);
3119 
3120   SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
3121   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3122                            DAG.getIntPtrConstant(0, DL));
3123   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3124                            DAG.getIntPtrConstant(1, DL));
3125 
3126   SmallVector<SDValue, 5> NewOps(Op->getNumOperands() + 1);
3127   SmallVector<EVT, 3> ResultsType(Node->values());
3128 
3129   NewOps[0] = Op->getOperand(0); // Chain
3130   NewOps[1] = Op->getOperand(1); // Dst Reg
3131   NewOps[2] = Lo;                // Lower 64-bit
3132   NewOps[3] = Hi;                // Higher 64-bit
3133   if (Op.getNumOperands() == 4)
3134     NewOps[4] = Op->getOperand(3); // Glue if exists
3135 
3136   return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
3137 }
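
// Before/after sketch of the operand rewrite above (illustrative only):
//   CopyToReg chain, reg, val:i128 [, glue]
//     -> CopyToReg chain, reg, lo:i64, hi:i64 [, glue]
// where lo/hi are EXTRACT_VECTOR_ELT 0/1 of the v2i64 bitcast of val.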
3138 
3139 unsigned NVPTXTargetLowering::getNumRegisters(
3140     LLVMContext &Context, EVT VT,
3141     std::optional<MVT> RegisterVT = std::nullopt) const {
3142   if (VT == MVT::i128 && RegisterVT == MVT::i128)
3143     return 1;
3144   return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
3145 }
3146 
3147 bool NVPTXTargetLowering::splitValueIntoRegisterParts(
3148     SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
3149     unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
3150   if (Val.getValueType() == MVT::i128 && NumParts == 1) {
3151     Parts[0] = Val;
3152     return true;
3153   }
3154   return false;
3155 }
3156 
3157 // This creates a target external symbol for a function parameter.
3158 // The symbol's name is composed from the parameter's index and the function
3159 // name. A negative index corresponds to the special parameter (unsized array)
3160 // used for passing variable arguments.
3161 SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,
3162                                             EVT v) const {
3163   StringRef SavedStr = nvTM->getStrPool().save(
3164       getParamName(&DAG.getMachineFunction().getFunction(), idx));
3165   return DAG.getTargetExternalSymbol(SavedStr.data(), v);
3166 }
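
// For example, parameter 0 of a function 'foo' is referred to through the
// symbol 'foo_param_0', and idx == -1 maps to the 'foo_vararg' array used by
// LowerVASTART above.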
3167 
3168 SDValue NVPTXTargetLowering::LowerFormalArguments(
3169     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3170     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3171     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3172   MachineFunction &MF = DAG.getMachineFunction();
3173   const DataLayout &DL = DAG.getDataLayout();
3174   auto PtrVT = getPointerTy(DAG.getDataLayout());
3175 
3176   const Function *F = &MF.getFunction();
3177   const AttributeList &PAL = F->getAttributes();
3178   const TargetLowering *TLI = STI.getTargetLowering();
3179 
3180   SDValue Root = DAG.getRoot();
3181   std::vector<SDValue> OutChains;
3182 
3183   bool isABI = (STI.getSmVersion() >= 20);
3184   assert(isABI && "Non-ABI compilation is not supported");
3185   if (!isABI)
3186     return Chain;
3187 
3188   std::vector<Type *> argTypes;
3189   std::vector<const Argument *> theArgs;
3190   for (const Argument &I : F->args()) {
3191     theArgs.push_back(&I);
3192     argTypes.push_back(I.getType());
3193   }
3194   // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
3195   // Ins.size() will be larger
3196   //   * if there is an aggregate argument with multiple fields (each field
3197   //     showing up separately in Ins)
3198   //   * if there is a vector argument with more than typical vector-length
3199   //     elements (generally if more than 4) where each vector element is
3200   //     individually present in Ins.
3201   // So a different index should be used for indexing into Ins.
3202   // See similar issue in LowerCall.
3203   unsigned InsIdx = 0;
3204 
3205   for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) {
3206     Type *Ty = argTypes[i];
3207 
3208     if (theArgs[i]->use_empty()) {
3209       // argument is dead
3210       if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) {
3211         SmallVector<EVT, 16> vtparts;
3212 
3213         ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
3214         if (vtparts.empty())
3215           report_fatal_error("Empty parameter types are not supported");
3216 
3217         for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
3218              ++parti) {
3219           InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3220           ++InsIdx;
3221         }
3222         if (vtparts.size() > 0)
3223           --InsIdx;
3224         continue;
3225       }
3226       if (Ty->isVectorTy()) {
3227         EVT ObjectVT = getValueType(DL, Ty);
3228         unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
3229         for (unsigned parti = 0; parti < NumRegs; ++parti) {
3230           InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3231           ++InsIdx;
3232         }
3233         if (NumRegs > 0)
3234           --InsIdx;
3235         continue;
3236       }
3237       InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3238       continue;
3239     }
3240 
3241     // In the following cases, assign a node order of "i+1"
3242     // to newly created nodes. The SDNodes for params have to
3243     // appear in the same order as their order of appearance
3244     // in the original function. "i+1" holds that order.
3245     if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
3246       bool aggregateIsPacked = false;
3247       if (StructType *STy = dyn_cast<StructType>(Ty))
3248         aggregateIsPacked = STy->isPacked();
3249 
3250       SmallVector<EVT, 16> VTs;
3251       SmallVector<uint64_t, 16> Offsets;
3252       ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
3253       if (VTs.empty())
3254         report_fatal_error("Empty parameter types are not supported");
3255 
3256       Align ArgAlign = getFunctionArgumentAlignment(
3257           F, Ty, i + AttributeList::FirstArgIndex, DL);
3258       auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
3259 
3260       SDValue Arg = getParamSymbol(DAG, i, PtrVT);
3261       int VecIdx = -1; // Index of the first element of the current vector.
3262       for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
3263         if (VectorInfo[parti] & PVF_FIRST) {
3264           assert(VecIdx == -1 && "Orphaned vector.");
3265           VecIdx = parti;
3266         }
3267 
3268         // That's the last element of this store op.
3269         if (VectorInfo[parti] & PVF_LAST) {
3270           unsigned NumElts = parti - VecIdx + 1;
3271           EVT EltVT = VTs[parti];
3272           // i1 is loaded/stored as i8.
3273           EVT LoadVT = EltVT;
3274           if (EltVT == MVT::i1)
3275             LoadVT = MVT::i8;
3276           else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
3277             // getLoad needs a vector type, but it can't handle
3278             // vectors which contain v2f16 or v2bf16 elements. So we must load
3279             // using i32 here and then bitcast back.
3280             LoadVT = MVT::i32;
3281 
3282           EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
3283           SDValue VecAddr =
3284               DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
3285                           DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
3286           Value *srcValue = Constant::getNullValue(PointerType::get(
3287               EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
3288 
3289           const MaybeAlign PartAlign = [&]() -> MaybeAlign {
3290             if (aggregateIsPacked)
3291               return Align(1);
3292             if (NumElts != 1)
3293               return std::nullopt;
3294             Align PartAlign =
3295                 DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext()));
3296             return commonAlignment(PartAlign, Offsets[parti]);
3297           }();
3298           SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
3299                                   MachinePointerInfo(srcValue), PartAlign,
3300                                   MachineMemOperand::MODereferenceable |
3301                                       MachineMemOperand::MOInvariant);
3302           if (P.getNode())
3303             P.getNode()->setIROrder(i + 1);
3304           for (unsigned j = 0; j < NumElts; ++j) {
3305             SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
3306                                       DAG.getIntPtrConstant(j, dl));
3307             // We've loaded i1 as an i8 and now must truncate it back to i1
3308             if (EltVT == MVT::i1)
3309               Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
3310             // v2f16 was loaded as an i32. Now we must bitcast it back.
3311             else if (EltVT != LoadVT)
3312               Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt);
3313 
3314             // If a promoted integer type is used, truncate down to the original type.
3315             MVT PromotedVT;
3316             if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
3317               Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
3318             }
3319 
3320             // Extend the element if necessary (e.g. an i8 is loaded
3321             // into an i16 register)
3322             if (Ins[InsIdx].VT.isInteger() &&
3323                 Ins[InsIdx].VT.getFixedSizeInBits() >
3324                     LoadVT.getFixedSizeInBits()) {
3325               unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
3326                                                            : ISD::ZERO_EXTEND;
3327               Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
3328             }
3329             InVals.push_back(Elt);
3330           }
3331 
3332           // Reset vector tracking state.
3333           VecIdx = -1;
3334         }
3335         ++InsIdx;
3336       }
3337       if (VTs.size() > 0)
3338         --InsIdx;
3339       continue;
3340     }
3341 
3342     // Param has ByVal attribute
3343     // Return MoveParam(param symbol).
3344     // Ideally, the param symbol could be returned directly,
3345     // but when the SDNode builder decides to use it in a CopyToReg(),
3346     // the machine instruction fails because the TargetExternalSymbol
3347     // (not yet lowered) is target dependent, and CopyToReg assumes
3348     // the source is already lowered.
3349     EVT ObjectVT = getValueType(DL, Ty);
3350     assert(ObjectVT == Ins[InsIdx].VT &&
3351            "Ins type did not match function type");
3352     SDValue Arg = getParamSymbol(DAG, i, PtrVT);
3353     SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
3354     if (p.getNode())
3355       p.getNode()->setIROrder(i + 1);
3356     InVals.push_back(p);
3357   }
3358 
3359   if (!OutChains.empty())
3360     DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
3361 
3362   return Chain;
3363 }
3364 
3365 // Use byte stores when the param address of the return value is unaligned.
3366 // This may happen when the return value is a field of a packed structure.
3367 static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain,
3368                                       uint64_t Offset, EVT ElementType,
3369                                       SDValue RetVal, const SDLoc &dl) {
3370   // Bit logic only works on integer types
3371   if (adjustElementType(ElementType))
3372     RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
3373 
3374   // Store each byte
3375   for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
3376     // Shift the byte to the last byte position
3377     SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal,
3378                                    DAG.getConstant(i * 8, dl, MVT::i32));
3379     SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32),
3380                                ShiftVal};
3381     // A truncating store emits only that low byte by using
3382     //     st.param.b8
3383     // even though the register type can be larger than b8.
3384     Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
3385                                     DAG.getVTList(MVT::Other), StoreOperands,
3386                                     MVT::i8, MachinePointerInfo(), std::nullopt,
3387                                     MachineMemOperand::MOStore);
3388   }
3389   return Chain;
3390 }
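
// Worked example of the loop above (illustrative): returning a 32-bit field
// at unaligned offset 1 produces four byte-sized stores, roughly
//   st.param.b8 [func_retval0+1], val
//   st.param.b8 [func_retval0+2], val >> 8
//   st.param.b8 [func_retval0+3], val >> 16
//   st.param.b8 [func_retval0+4], val >> 24
// ("func_retval0" being the usual PTX name of the return-value parameter).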
3391 
3392 SDValue
3393 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3394                                  bool isVarArg,
3395                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
3396                                  const SmallVectorImpl<SDValue> &OutVals,
3397                                  const SDLoc &dl, SelectionDAG &DAG) const {
3398   const MachineFunction &MF = DAG.getMachineFunction();
3399   const Function &F = MF.getFunction();
3400   Type *RetTy = MF.getFunction().getReturnType();
3401 
3402   bool isABI = (STI.getSmVersion() >= 20);
3403   assert(isABI && "Non-ABI compilation is not supported");
3404   if (!isABI)
3405     return Chain;
3406 
3407   const DataLayout &DL = DAG.getDataLayout();
3408   SmallVector<SDValue, 16> PromotedOutVals;
3409   SmallVector<EVT, 16> VTs;
3410   SmallVector<uint64_t, 16> Offsets;
3411   ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
3412   assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3413 
3414   for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3415     SDValue PromotedOutVal = OutVals[i];
3416     MVT PromotedVT;
3417     if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
3418       VTs[i] = EVT(PromotedVT);
3419     }
3420     if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) {
3421       llvm::ISD::NodeType Ext =
3422           Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3423       PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
3424     }
3425     PromotedOutVals.push_back(PromotedOutVal);
3426   }
3427 
3428   auto VectorInfo = VectorizePTXValueVTs(
3429       VTs, Offsets,
3430       RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
3431                        : Align(1));
3432 
3433   // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3434   // 32-bits are sign extended or zero extended, depending on whether
3435   // they are signed or unsigned types.
3436   bool ExtendIntegerRetVal =
3437       RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
3438 
3439   SmallVector<SDValue, 6> StoreOperands;
3440   for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3441     SDValue OutVal = OutVals[i];
3442     SDValue RetVal = PromotedOutVals[i];
3443 
3444     if (ExtendIntegerRetVal) {
3445       RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
3446                                                   : ISD::ZERO_EXTEND,
3447                            dl, MVT::i32, RetVal);
3448     } else if (OutVal.getValueSizeInBits() < 16) {
3449       // Use 16-bit registers for small load-stores as it's the
3450       // smallest general purpose register size supported by NVPTX.
3451       RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
3452     }
3453 
3454     // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned
3455     // for a scalar store. In such cases, fall back to byte stores.
3456     if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) {
3457       EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3458       Align ElementTypeAlign =
3459           DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext()));
3460       Align ElementAlign =
3461           commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]);
3462       if (ElementAlign < ElementTypeAlign) {
3463         assert(StoreOperands.empty() && "Orphaned operand list.");
3464         Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType,
3465                                        RetVal, dl);
3466 
3467         // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes
3468         // into the graph, so just move on to the next element.
3469         continue;
3470       }
3471     }
3472 
3473     // New load/store. Record chain and offset operands.
3474     if (VectorInfo[i] & PVF_FIRST) {
3475       assert(StoreOperands.empty() && "Orphaned operand list.");
3476       StoreOperands.push_back(Chain);
3477       StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
3478     }
3479 
3480     // Record the value to return.
3481     StoreOperands.push_back(RetVal);
3482 
3483     // That's the last element of this store op.
3484     if (VectorInfo[i] & PVF_LAST) {
3485       NVPTXISD::NodeType Op;
3486       unsigned NumElts = StoreOperands.size() - 2;
3487       switch (NumElts) {
3488       case 1:
3489         Op = NVPTXISD::StoreRetval;
3490         break;
3491       case 2:
3492         Op = NVPTXISD::StoreRetvalV2;
3493         break;
3494       case 4:
3495         Op = NVPTXISD::StoreRetvalV4;
3496         break;
3497       default:
3498         llvm_unreachable("Invalid vector info.");
3499       }
3500 
3501       // Adjust type of load/store op if we've extended the scalar
3502       // return value.
3503       EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3504       Chain = DAG.getMemIntrinsicNode(
3505           Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
3506           MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
3507       // Cleanup vector state.
3508       StoreOperands.clear();
3509     }
3510   }
3511 
3512   return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3513 }
3514 
3515 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
3516     SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
3517     SelectionDAG &DAG) const {
3518   if (Constraint.size() > 1)
3519     return;
3520   TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3521 }
3522 
3523 static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
3524   switch (Intrinsic) {
3525   default:
3526     return 0;
3527 
3528   case Intrinsic::nvvm_tex_1d_v4f32_s32:
3529     return NVPTXISD::Tex1DFloatS32;
3530   case Intrinsic::nvvm_tex_1d_v4f32_f32:
3531     return NVPTXISD::Tex1DFloatFloat;
3532   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3533     return NVPTXISD::Tex1DFloatFloatLevel;
3534   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3535     return NVPTXISD::Tex1DFloatFloatGrad;
3536   case Intrinsic::nvvm_tex_1d_v4s32_s32:
3537     return NVPTXISD::Tex1DS32S32;
3538   case Intrinsic::nvvm_tex_1d_v4s32_f32:
3539     return NVPTXISD::Tex1DS32Float;
3540   case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3541     return NVPTXISD::Tex1DS32FloatLevel;
3542   case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3543     return NVPTXISD::Tex1DS32FloatGrad;
3544   case Intrinsic::nvvm_tex_1d_v4u32_s32:
3545     return NVPTXISD::Tex1DU32S32;
3546   case Intrinsic::nvvm_tex_1d_v4u32_f32:
3547     return NVPTXISD::Tex1DU32Float;
3548   case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3549     return NVPTXISD::Tex1DU32FloatLevel;
3550   case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3551     return NVPTXISD::Tex1DU32FloatGrad;
3552 
3553   case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3554     return NVPTXISD::Tex1DArrayFloatS32;
3555   case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3556     return NVPTXISD::Tex1DArrayFloatFloat;
3557   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3558     return NVPTXISD::Tex1DArrayFloatFloatLevel;
3559   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3560     return NVPTXISD::Tex1DArrayFloatFloatGrad;
3561   case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3562     return NVPTXISD::Tex1DArrayS32S32;
3563   case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3564     return NVPTXISD::Tex1DArrayS32Float;
3565   case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3566     return NVPTXISD::Tex1DArrayS32FloatLevel;
3567   case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3568     return NVPTXISD::Tex1DArrayS32FloatGrad;
3569   case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3570     return NVPTXISD::Tex1DArrayU32S32;
3571   case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3572     return NVPTXISD::Tex1DArrayU32Float;
3573   case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3574     return NVPTXISD::Tex1DArrayU32FloatLevel;
3575   case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3576     return NVPTXISD::Tex1DArrayU32FloatGrad;
3577 
3578   case Intrinsic::nvvm_tex_2d_v4f32_s32:
3579     return NVPTXISD::Tex2DFloatS32;
3580   case Intrinsic::nvvm_tex_2d_v4f32_f32:
3581     return NVPTXISD::Tex2DFloatFloat;
3582   case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3583     return NVPTXISD::Tex2DFloatFloatLevel;
3584   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3585     return NVPTXISD::Tex2DFloatFloatGrad;
3586   case Intrinsic::nvvm_tex_2d_v4s32_s32:
3587     return NVPTXISD::Tex2DS32S32;
3588   case Intrinsic::nvvm_tex_2d_v4s32_f32:
3589     return NVPTXISD::Tex2DS32Float;
3590   case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3591     return NVPTXISD::Tex2DS32FloatLevel;
3592   case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3593     return NVPTXISD::Tex2DS32FloatGrad;
3594   case Intrinsic::nvvm_tex_2d_v4u32_s32:
3595     return NVPTXISD::Tex2DU32S32;
3596   case Intrinsic::nvvm_tex_2d_v4u32_f32:
3597     return NVPTXISD::Tex2DU32Float;
3598   case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3599     return NVPTXISD::Tex2DU32FloatLevel;
3600   case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3601     return NVPTXISD::Tex2DU32FloatGrad;
3602 
3603   case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3604     return NVPTXISD::Tex2DArrayFloatS32;
3605   case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3606     return NVPTXISD::Tex2DArrayFloatFloat;
3607   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3608     return NVPTXISD::Tex2DArrayFloatFloatLevel;
3609   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3610     return NVPTXISD::Tex2DArrayFloatFloatGrad;
3611   case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3612     return NVPTXISD::Tex2DArrayS32S32;
3613   case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3614     return NVPTXISD::Tex2DArrayS32Float;
3615   case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3616     return NVPTXISD::Tex2DArrayS32FloatLevel;
3617   case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3618     return NVPTXISD::Tex2DArrayS32FloatGrad;
3619   case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3620     return NVPTXISD::Tex2DArrayU32S32;
3621   case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3622     return NVPTXISD::Tex2DArrayU32Float;
3623   case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3624     return NVPTXISD::Tex2DArrayU32FloatLevel;
3625   case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3626     return NVPTXISD::Tex2DArrayU32FloatGrad;
3627 
3628   case Intrinsic::nvvm_tex_3d_v4f32_s32:
3629     return NVPTXISD::Tex3DFloatS32;
3630   case Intrinsic::nvvm_tex_3d_v4f32_f32:
3631     return NVPTXISD::Tex3DFloatFloat;
3632   case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3633     return NVPTXISD::Tex3DFloatFloatLevel;
3634   case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3635     return NVPTXISD::Tex3DFloatFloatGrad;
3636   case Intrinsic::nvvm_tex_3d_v4s32_s32:
3637     return NVPTXISD::Tex3DS32S32;
3638   case Intrinsic::nvvm_tex_3d_v4s32_f32:
3639     return NVPTXISD::Tex3DS32Float;
3640   case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3641     return NVPTXISD::Tex3DS32FloatLevel;
3642   case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3643     return NVPTXISD::Tex3DS32FloatGrad;
3644   case Intrinsic::nvvm_tex_3d_v4u32_s32:
3645     return NVPTXISD::Tex3DU32S32;
3646   case Intrinsic::nvvm_tex_3d_v4u32_f32:
3647     return NVPTXISD::Tex3DU32Float;
3648   case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3649     return NVPTXISD::Tex3DU32FloatLevel;
3650   case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3651     return NVPTXISD::Tex3DU32FloatGrad;
3652 
3653   case Intrinsic::nvvm_tex_cube_v4f32_f32:
3654     return NVPTXISD::TexCubeFloatFloat;
3655   case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3656     return NVPTXISD::TexCubeFloatFloatLevel;
3657   case Intrinsic::nvvm_tex_cube_v4s32_f32:
3658     return NVPTXISD::TexCubeS32Float;
3659   case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3660     return NVPTXISD::TexCubeS32FloatLevel;
3661   case Intrinsic::nvvm_tex_cube_v4u32_f32:
3662     return NVPTXISD::TexCubeU32Float;
3663   case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3664     return NVPTXISD::TexCubeU32FloatLevel;
3665 
3666   case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3667     return NVPTXISD::TexCubeArrayFloatFloat;
3668   case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3669     return NVPTXISD::TexCubeArrayFloatFloatLevel;
3670   case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3671     return NVPTXISD::TexCubeArrayS32Float;
3672   case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3673     return NVPTXISD::TexCubeArrayS32FloatLevel;
3674   case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3675     return NVPTXISD::TexCubeArrayU32Float;
3676   case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3677     return NVPTXISD::TexCubeArrayU32FloatLevel;
3678 
3679   case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3680     return NVPTXISD::Tld4R2DFloatFloat;
3681   case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3682     return NVPTXISD::Tld4G2DFloatFloat;
3683   case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3684     return NVPTXISD::Tld4B2DFloatFloat;
3685   case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3686     return NVPTXISD::Tld4A2DFloatFloat;
3687   case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3688     return NVPTXISD::Tld4R2DS64Float;
3689   case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3690     return NVPTXISD::Tld4G2DS64Float;
3691   case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3692     return NVPTXISD::Tld4B2DS64Float;
3693   case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3694     return NVPTXISD::Tld4A2DS64Float;
3695   case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3696     return NVPTXISD::Tld4R2DU64Float;
3697   case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3698     return NVPTXISD::Tld4G2DU64Float;
3699   case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3700     return NVPTXISD::Tld4B2DU64Float;
3701   case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3702     return NVPTXISD::Tld4A2DU64Float;
3703 
3704   case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3705     return NVPTXISD::TexUnified1DFloatS32;
3706   case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3707     return NVPTXISD::TexUnified1DFloatFloat;
3708   case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3709     return NVPTXISD::TexUnified1DFloatFloatLevel;
3710   case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3711     return NVPTXISD::TexUnified1DFloatFloatGrad;
3712   case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3713     return NVPTXISD::TexUnified1DS32S32;
3714   case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3715     return NVPTXISD::TexUnified1DS32Float;
3716   case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3717     return NVPTXISD::TexUnified1DS32FloatLevel;
3718   case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3719     return NVPTXISD::TexUnified1DS32FloatGrad;
3720   case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3721     return NVPTXISD::TexUnified1DU32S32;
3722   case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3723     return NVPTXISD::TexUnified1DU32Float;
3724   case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3725     return NVPTXISD::TexUnified1DU32FloatLevel;
3726   case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3727     return NVPTXISD::TexUnified1DU32FloatGrad;
3728 
3729   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3730     return NVPTXISD::TexUnified1DArrayFloatS32;
3731   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3732     return NVPTXISD::TexUnified1DArrayFloatFloat;
3733   case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3734     return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
3735   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3736     return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
3737   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3738     return NVPTXISD::TexUnified1DArrayS32S32;
3739   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3740     return NVPTXISD::TexUnified1DArrayS32Float;
3741   case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3742     return NVPTXISD::TexUnified1DArrayS32FloatLevel;
3743   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3744     return NVPTXISD::TexUnified1DArrayS32FloatGrad;
3745   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3746     return NVPTXISD::TexUnified1DArrayU32S32;
3747   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3748     return NVPTXISD::TexUnified1DArrayU32Float;
3749   case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3750     return NVPTXISD::TexUnified1DArrayU32FloatLevel;
3751   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3752     return NVPTXISD::TexUnified1DArrayU32FloatGrad;
3753 
3754   case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3755     return NVPTXISD::TexUnified2DFloatS32;
3756   case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3757     return NVPTXISD::TexUnified2DFloatFloat;
3758   case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3759     return NVPTXISD::TexUnified2DFloatFloatLevel;
3760   case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3761     return NVPTXISD::TexUnified2DFloatFloatGrad;
3762   case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3763     return NVPTXISD::TexUnified2DS32S32;
3764   case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3765     return NVPTXISD::TexUnified2DS32Float;
3766   case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3767     return NVPTXISD::TexUnified2DS32FloatLevel;
3768   case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3769     return NVPTXISD::TexUnified2DS32FloatGrad;
3770   case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3771     return NVPTXISD::TexUnified2DU32S32;
3772   case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3773     return NVPTXISD::TexUnified2DU32Float;
3774   case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3775     return NVPTXISD::TexUnified2DU32FloatLevel;
3776   case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3777     return NVPTXISD::TexUnified2DU32FloatGrad;
3778 
3779   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3780     return NVPTXISD::TexUnified2DArrayFloatS32;
3781   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3782     return NVPTXISD::TexUnified2DArrayFloatFloat;
3783   case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3784     return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
3785   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3786     return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
3787   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3788     return NVPTXISD::TexUnified2DArrayS32S32;
3789   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3790     return NVPTXISD::TexUnified2DArrayS32Float;
3791   case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3792     return NVPTXISD::TexUnified2DArrayS32FloatLevel;
3793   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3794     return NVPTXISD::TexUnified2DArrayS32FloatGrad;
3795   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3796     return NVPTXISD::TexUnified2DArrayU32S32;
3797   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3798     return NVPTXISD::TexUnified2DArrayU32Float;
3799   case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3800     return NVPTXISD::TexUnified2DArrayU32FloatLevel;
3801   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3802     return NVPTXISD::TexUnified2DArrayU32FloatGrad;
3803 
3804   case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3805     return NVPTXISD::TexUnified3DFloatS32;
3806   case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3807     return NVPTXISD::TexUnified3DFloatFloat;
3808   case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3809     return NVPTXISD::TexUnified3DFloatFloatLevel;
3810   case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3811     return NVPTXISD::TexUnified3DFloatFloatGrad;
3812   case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3813     return NVPTXISD::TexUnified3DS32S32;
3814   case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3815     return NVPTXISD::TexUnified3DS32Float;
3816   case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3817     return NVPTXISD::TexUnified3DS32FloatLevel;
3818   case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3819     return NVPTXISD::TexUnified3DS32FloatGrad;
3820   case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3821     return NVPTXISD::TexUnified3DU32S32;
3822   case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3823     return NVPTXISD::TexUnified3DU32Float;
3824   case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3825     return NVPTXISD::TexUnified3DU32FloatLevel;
3826   case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3827     return NVPTXISD::TexUnified3DU32FloatGrad;
3828 
3829   case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3830     return NVPTXISD::TexUnifiedCubeFloatFloat;
3831   case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3832     return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
3833   case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3834     return NVPTXISD::TexUnifiedCubeS32Float;
3835   case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3836     return NVPTXISD::TexUnifiedCubeS32FloatLevel;
3837   case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3838     return NVPTXISD::TexUnifiedCubeU32Float;
3839   case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3840     return NVPTXISD::TexUnifiedCubeU32FloatLevel;
3841 
3842   case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3843     return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
3844   case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3845     return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
3846   case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3847     return NVPTXISD::TexUnifiedCubeArrayS32Float;
3848   case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3849     return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
3850   case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3851     return NVPTXISD::TexUnifiedCubeArrayU32Float;
3852   case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3853     return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
3854 
3855   case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
3856     return NVPTXISD::TexUnifiedCubeFloatFloatGrad;
3857   case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
3858     return NVPTXISD::TexUnifiedCubeS32FloatGrad;
3859   case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
3860     return NVPTXISD::TexUnifiedCubeU32FloatGrad;
3861   case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
3862     return NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad;
3863   case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
3864     return NVPTXISD::TexUnifiedCubeArrayS32FloatGrad;
3865   case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
3866     return NVPTXISD::TexUnifiedCubeArrayU32FloatGrad;
3867 
3868   case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3869     return NVPTXISD::Tld4UnifiedR2DFloatFloat;
3870   case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3871     return NVPTXISD::Tld4UnifiedG2DFloatFloat;
3872   case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3873     return NVPTXISD::Tld4UnifiedB2DFloatFloat;
3874   case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3875     return NVPTXISD::Tld4UnifiedA2DFloatFloat;
3876   case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3877     return NVPTXISD::Tld4UnifiedR2DS64Float;
3878   case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3879     return NVPTXISD::Tld4UnifiedG2DS64Float;
3880   case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3881     return NVPTXISD::Tld4UnifiedB2DS64Float;
3882   case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3883     return NVPTXISD::Tld4UnifiedA2DS64Float;
3884   case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3885     return NVPTXISD::Tld4UnifiedR2DU64Float;
3886   case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3887     return NVPTXISD::Tld4UnifiedG2DU64Float;
3888   case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3889     return NVPTXISD::Tld4UnifiedB2DU64Float;
3890   case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3891     return NVPTXISD::Tld4UnifiedA2DU64Float;
3892   }
3893 }
3894 
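// Map an NVVM surface-load ("suld") intrinsic to the corresponding NVPTXISD
// opcode. Returns 0 for intrinsics that are not surface loads.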
3895 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3896   switch (Intrinsic) {
3897   default:
3898     return 0;
3899   case Intrinsic::nvvm_suld_1d_i8_clamp:
3900     return NVPTXISD::Suld1DI8Clamp;
3901   case Intrinsic::nvvm_suld_1d_i16_clamp:
3902     return NVPTXISD::Suld1DI16Clamp;
3903   case Intrinsic::nvvm_suld_1d_i32_clamp:
3904     return NVPTXISD::Suld1DI32Clamp;
3905   case Intrinsic::nvvm_suld_1d_i64_clamp:
3906     return NVPTXISD::Suld1DI64Clamp;
3907   case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3908     return NVPTXISD::Suld1DV2I8Clamp;
3909   case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3910     return NVPTXISD::Suld1DV2I16Clamp;
3911   case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3912     return NVPTXISD::Suld1DV2I32Clamp;
3913   case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3914     return NVPTXISD::Suld1DV2I64Clamp;
3915   case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3916     return NVPTXISD::Suld1DV4I8Clamp;
3917   case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3918     return NVPTXISD::Suld1DV4I16Clamp;
3919   case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3920     return NVPTXISD::Suld1DV4I32Clamp;
3921   case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3922     return NVPTXISD::Suld1DArrayI8Clamp;
3923   case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3924     return NVPTXISD::Suld1DArrayI16Clamp;
3925   case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3926     return NVPTXISD::Suld1DArrayI32Clamp;
3927   case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3928     return NVPTXISD::Suld1DArrayI64Clamp;
3929   case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3930     return NVPTXISD::Suld1DArrayV2I8Clamp;
3931   case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3932     return NVPTXISD::Suld1DArrayV2I16Clamp;
3933   case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3934     return NVPTXISD::Suld1DArrayV2I32Clamp;
3935   case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3936     return NVPTXISD::Suld1DArrayV2I64Clamp;
3937   case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3938     return NVPTXISD::Suld1DArrayV4I8Clamp;
3939   case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3940     return NVPTXISD::Suld1DArrayV4I16Clamp;
3941   case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3942     return NVPTXISD::Suld1DArrayV4I32Clamp;
3943   case Intrinsic::nvvm_suld_2d_i8_clamp:
3944     return NVPTXISD::Suld2DI8Clamp;
3945   case Intrinsic::nvvm_suld_2d_i16_clamp:
3946     return NVPTXISD::Suld2DI16Clamp;
3947   case Intrinsic::nvvm_suld_2d_i32_clamp:
3948     return NVPTXISD::Suld2DI32Clamp;
3949   case Intrinsic::nvvm_suld_2d_i64_clamp:
3950     return NVPTXISD::Suld2DI64Clamp;
3951   case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3952     return NVPTXISD::Suld2DV2I8Clamp;
3953   case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3954     return NVPTXISD::Suld2DV2I16Clamp;
3955   case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3956     return NVPTXISD::Suld2DV2I32Clamp;
3957   case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3958     return NVPTXISD::Suld2DV2I64Clamp;
3959   case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3960     return NVPTXISD::Suld2DV4I8Clamp;
3961   case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3962     return NVPTXISD::Suld2DV4I16Clamp;
3963   case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3964     return NVPTXISD::Suld2DV4I32Clamp;
3965   case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3966     return NVPTXISD::Suld2DArrayI8Clamp;
3967   case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3968     return NVPTXISD::Suld2DArrayI16Clamp;
3969   case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3970     return NVPTXISD::Suld2DArrayI32Clamp;
3971   case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3972     return NVPTXISD::Suld2DArrayI64Clamp;
3973   case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3974     return NVPTXISD::Suld2DArrayV2I8Clamp;
3975   case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3976     return NVPTXISD::Suld2DArrayV2I16Clamp;
3977   case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3978     return NVPTXISD::Suld2DArrayV2I32Clamp;
3979   case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3980     return NVPTXISD::Suld2DArrayV2I64Clamp;
3981   case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3982     return NVPTXISD::Suld2DArrayV4I8Clamp;
3983   case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3984     return NVPTXISD::Suld2DArrayV4I16Clamp;
3985   case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3986     return NVPTXISD::Suld2DArrayV4I32Clamp;
3987   case Intrinsic::nvvm_suld_3d_i8_clamp:
3988     return NVPTXISD::Suld3DI8Clamp;
3989   case Intrinsic::nvvm_suld_3d_i16_clamp:
3990     return NVPTXISD::Suld3DI16Clamp;
3991   case Intrinsic::nvvm_suld_3d_i32_clamp:
3992     return NVPTXISD::Suld3DI32Clamp;
3993   case Intrinsic::nvvm_suld_3d_i64_clamp:
3994     return NVPTXISD::Suld3DI64Clamp;
3995   case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3996     return NVPTXISD::Suld3DV2I8Clamp;
3997   case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3998     return NVPTXISD::Suld3DV2I16Clamp;
3999   case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4000     return NVPTXISD::Suld3DV2I32Clamp;
4001   case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4002     return NVPTXISD::Suld3DV2I64Clamp;
4003   case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4004     return NVPTXISD::Suld3DV4I8Clamp;
4005   case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4006     return NVPTXISD::Suld3DV4I16Clamp;
4007   case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4008     return NVPTXISD::Suld3DV4I32Clamp;
4009   case Intrinsic::nvvm_suld_1d_i8_trap:
4010     return NVPTXISD::Suld1DI8Trap;
4011   case Intrinsic::nvvm_suld_1d_i16_trap:
4012     return NVPTXISD::Suld1DI16Trap;
4013   case Intrinsic::nvvm_suld_1d_i32_trap:
4014     return NVPTXISD::Suld1DI32Trap;
4015   case Intrinsic::nvvm_suld_1d_i64_trap:
4016     return NVPTXISD::Suld1DI64Trap;
4017   case Intrinsic::nvvm_suld_1d_v2i8_trap:
4018     return NVPTXISD::Suld1DV2I8Trap;
4019   case Intrinsic::nvvm_suld_1d_v2i16_trap:
4020     return NVPTXISD::Suld1DV2I16Trap;
4021   case Intrinsic::nvvm_suld_1d_v2i32_trap:
4022     return NVPTXISD::Suld1DV2I32Trap;
4023   case Intrinsic::nvvm_suld_1d_v2i64_trap:
4024     return NVPTXISD::Suld1DV2I64Trap;
4025   case Intrinsic::nvvm_suld_1d_v4i8_trap:
4026     return NVPTXISD::Suld1DV4I8Trap;
4027   case Intrinsic::nvvm_suld_1d_v4i16_trap:
4028     return NVPTXISD::Suld1DV4I16Trap;
4029   case Intrinsic::nvvm_suld_1d_v4i32_trap:
4030     return NVPTXISD::Suld1DV4I32Trap;
4031   case Intrinsic::nvvm_suld_1d_array_i8_trap:
4032     return NVPTXISD::Suld1DArrayI8Trap;
4033   case Intrinsic::nvvm_suld_1d_array_i16_trap:
4034     return NVPTXISD::Suld1DArrayI16Trap;
4035   case Intrinsic::nvvm_suld_1d_array_i32_trap:
4036     return NVPTXISD::Suld1DArrayI32Trap;
4037   case Intrinsic::nvvm_suld_1d_array_i64_trap:
4038     return NVPTXISD::Suld1DArrayI64Trap;
4039   case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4040     return NVPTXISD::Suld1DArrayV2I8Trap;
4041   case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4042     return NVPTXISD::Suld1DArrayV2I16Trap;
4043   case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4044     return NVPTXISD::Suld1DArrayV2I32Trap;
4045   case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4046     return NVPTXISD::Suld1DArrayV2I64Trap;
4047   case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4048     return NVPTXISD::Suld1DArrayV4I8Trap;
4049   case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4050     return NVPTXISD::Suld1DArrayV4I16Trap;
4051   case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4052     return NVPTXISD::Suld1DArrayV4I32Trap;
4053   case Intrinsic::nvvm_suld_2d_i8_trap:
4054     return NVPTXISD::Suld2DI8Trap;
4055   case Intrinsic::nvvm_suld_2d_i16_trap:
4056     return NVPTXISD::Suld2DI16Trap;
4057   case Intrinsic::nvvm_suld_2d_i32_trap:
4058     return NVPTXISD::Suld2DI32Trap;
4059   case Intrinsic::nvvm_suld_2d_i64_trap:
4060     return NVPTXISD::Suld2DI64Trap;
4061   case Intrinsic::nvvm_suld_2d_v2i8_trap:
4062     return NVPTXISD::Suld2DV2I8Trap;
4063   case Intrinsic::nvvm_suld_2d_v2i16_trap:
4064     return NVPTXISD::Suld2DV2I16Trap;
4065   case Intrinsic::nvvm_suld_2d_v2i32_trap:
4066     return NVPTXISD::Suld2DV2I32Trap;
4067   case Intrinsic::nvvm_suld_2d_v2i64_trap:
4068     return NVPTXISD::Suld2DV2I64Trap;
4069   case Intrinsic::nvvm_suld_2d_v4i8_trap:
4070     return NVPTXISD::Suld2DV4I8Trap;
4071   case Intrinsic::nvvm_suld_2d_v4i16_trap:
4072     return NVPTXISD::Suld2DV4I16Trap;
4073   case Intrinsic::nvvm_suld_2d_v4i32_trap:
4074     return NVPTXISD::Suld2DV4I32Trap;
4075   case Intrinsic::nvvm_suld_2d_array_i8_trap:
4076     return NVPTXISD::Suld2DArrayI8Trap;
4077   case Intrinsic::nvvm_suld_2d_array_i16_trap:
4078     return NVPTXISD::Suld2DArrayI16Trap;
4079   case Intrinsic::nvvm_suld_2d_array_i32_trap:
4080     return NVPTXISD::Suld2DArrayI32Trap;
4081   case Intrinsic::nvvm_suld_2d_array_i64_trap:
4082     return NVPTXISD::Suld2DArrayI64Trap;
4083   case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4084     return NVPTXISD::Suld2DArrayV2I8Trap;
4085   case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4086     return NVPTXISD::Suld2DArrayV2I16Trap;
4087   case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4088     return NVPTXISD::Suld2DArrayV2I32Trap;
4089   case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4090     return NVPTXISD::Suld2DArrayV2I64Trap;
4091   case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4092     return NVPTXISD::Suld2DArrayV4I8Trap;
4093   case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4094     return NVPTXISD::Suld2DArrayV4I16Trap;
4095   case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4096     return NVPTXISD::Suld2DArrayV4I32Trap;
4097   case Intrinsic::nvvm_suld_3d_i8_trap:
4098     return NVPTXISD::Suld3DI8Trap;
4099   case Intrinsic::nvvm_suld_3d_i16_trap:
4100     return NVPTXISD::Suld3DI16Trap;
4101   case Intrinsic::nvvm_suld_3d_i32_trap:
4102     return NVPTXISD::Suld3DI32Trap;
4103   case Intrinsic::nvvm_suld_3d_i64_trap:
4104     return NVPTXISD::Suld3DI64Trap;
4105   case Intrinsic::nvvm_suld_3d_v2i8_trap:
4106     return NVPTXISD::Suld3DV2I8Trap;
4107   case Intrinsic::nvvm_suld_3d_v2i16_trap:
4108     return NVPTXISD::Suld3DV2I16Trap;
4109   case Intrinsic::nvvm_suld_3d_v2i32_trap:
4110     return NVPTXISD::Suld3DV2I32Trap;
4111   case Intrinsic::nvvm_suld_3d_v2i64_trap:
4112     return NVPTXISD::Suld3DV2I64Trap;
4113   case Intrinsic::nvvm_suld_3d_v4i8_trap:
4114     return NVPTXISD::Suld3DV4I8Trap;
4115   case Intrinsic::nvvm_suld_3d_v4i16_trap:
4116     return NVPTXISD::Suld3DV4I16Trap;
4117   case Intrinsic::nvvm_suld_3d_v4i32_trap:
4118     return NVPTXISD::Suld3DV4I32Trap;
4119   case Intrinsic::nvvm_suld_1d_i8_zero:
4120     return NVPTXISD::Suld1DI8Zero;
4121   case Intrinsic::nvvm_suld_1d_i16_zero:
4122     return NVPTXISD::Suld1DI16Zero;
4123   case Intrinsic::nvvm_suld_1d_i32_zero:
4124     return NVPTXISD::Suld1DI32Zero;
4125   case Intrinsic::nvvm_suld_1d_i64_zero:
4126     return NVPTXISD::Suld1DI64Zero;
4127   case Intrinsic::nvvm_suld_1d_v2i8_zero:
4128     return NVPTXISD::Suld1DV2I8Zero;
4129   case Intrinsic::nvvm_suld_1d_v2i16_zero:
4130     return NVPTXISD::Suld1DV2I16Zero;
4131   case Intrinsic::nvvm_suld_1d_v2i32_zero:
4132     return NVPTXISD::Suld1DV2I32Zero;
4133   case Intrinsic::nvvm_suld_1d_v2i64_zero:
4134     return NVPTXISD::Suld1DV2I64Zero;
4135   case Intrinsic::nvvm_suld_1d_v4i8_zero:
4136     return NVPTXISD::Suld1DV4I8Zero;
4137   case Intrinsic::nvvm_suld_1d_v4i16_zero:
4138     return NVPTXISD::Suld1DV4I16Zero;
4139   case Intrinsic::nvvm_suld_1d_v4i32_zero:
4140     return NVPTXISD::Suld1DV4I32Zero;
4141   case Intrinsic::nvvm_suld_1d_array_i8_zero:
4142     return NVPTXISD::Suld1DArrayI8Zero;
4143   case Intrinsic::nvvm_suld_1d_array_i16_zero:
4144     return NVPTXISD::Suld1DArrayI16Zero;
4145   case Intrinsic::nvvm_suld_1d_array_i32_zero:
4146     return NVPTXISD::Suld1DArrayI32Zero;
4147   case Intrinsic::nvvm_suld_1d_array_i64_zero:
4148     return NVPTXISD::Suld1DArrayI64Zero;
4149   case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4150     return NVPTXISD::Suld1DArrayV2I8Zero;
4151   case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4152     return NVPTXISD::Suld1DArrayV2I16Zero;
4153   case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4154     return NVPTXISD::Suld1DArrayV2I32Zero;
4155   case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4156     return NVPTXISD::Suld1DArrayV2I64Zero;
4157   case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4158     return NVPTXISD::Suld1DArrayV4I8Zero;
4159   case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4160     return NVPTXISD::Suld1DArrayV4I16Zero;
4161   case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4162     return NVPTXISD::Suld1DArrayV4I32Zero;
4163   case Intrinsic::nvvm_suld_2d_i8_zero:
4164     return NVPTXISD::Suld2DI8Zero;
4165   case Intrinsic::nvvm_suld_2d_i16_zero:
4166     return NVPTXISD::Suld2DI16Zero;
4167   case Intrinsic::nvvm_suld_2d_i32_zero:
4168     return NVPTXISD::Suld2DI32Zero;
4169   case Intrinsic::nvvm_suld_2d_i64_zero:
4170     return NVPTXISD::Suld2DI64Zero;
4171   case Intrinsic::nvvm_suld_2d_v2i8_zero:
4172     return NVPTXISD::Suld2DV2I8Zero;
4173   case Intrinsic::nvvm_suld_2d_v2i16_zero:
4174     return NVPTXISD::Suld2DV2I16Zero;
4175   case Intrinsic::nvvm_suld_2d_v2i32_zero:
4176     return NVPTXISD::Suld2DV2I32Zero;
4177   case Intrinsic::nvvm_suld_2d_v2i64_zero:
4178     return NVPTXISD::Suld2DV2I64Zero;
4179   case Intrinsic::nvvm_suld_2d_v4i8_zero:
4180     return NVPTXISD::Suld2DV4I8Zero;
4181   case Intrinsic::nvvm_suld_2d_v4i16_zero:
4182     return NVPTXISD::Suld2DV4I16Zero;
4183   case Intrinsic::nvvm_suld_2d_v4i32_zero:
4184     return NVPTXISD::Suld2DV4I32Zero;
4185   case Intrinsic::nvvm_suld_2d_array_i8_zero:
4186     return NVPTXISD::Suld2DArrayI8Zero;
4187   case Intrinsic::nvvm_suld_2d_array_i16_zero:
4188     return NVPTXISD::Suld2DArrayI16Zero;
4189   case Intrinsic::nvvm_suld_2d_array_i32_zero:
4190     return NVPTXISD::Suld2DArrayI32Zero;
4191   case Intrinsic::nvvm_suld_2d_array_i64_zero:
4192     return NVPTXISD::Suld2DArrayI64Zero;
4193   case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4194     return NVPTXISD::Suld2DArrayV2I8Zero;
4195   case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4196     return NVPTXISD::Suld2DArrayV2I16Zero;
4197   case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4198     return NVPTXISD::Suld2DArrayV2I32Zero;
4199   case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4200     return NVPTXISD::Suld2DArrayV2I64Zero;
4201   case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4202     return NVPTXISD::Suld2DArrayV4I8Zero;
4203   case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4204     return NVPTXISD::Suld2DArrayV4I16Zero;
4205   case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4206     return NVPTXISD::Suld2DArrayV4I32Zero;
4207   case Intrinsic::nvvm_suld_3d_i8_zero:
4208     return NVPTXISD::Suld3DI8Zero;
4209   case Intrinsic::nvvm_suld_3d_i16_zero:
4210     return NVPTXISD::Suld3DI16Zero;
4211   case Intrinsic::nvvm_suld_3d_i32_zero:
4212     return NVPTXISD::Suld3DI32Zero;
4213   case Intrinsic::nvvm_suld_3d_i64_zero:
4214     return NVPTXISD::Suld3DI64Zero;
4215   case Intrinsic::nvvm_suld_3d_v2i8_zero:
4216     return NVPTXISD::Suld3DV2I8Zero;
4217   case Intrinsic::nvvm_suld_3d_v2i16_zero:
4218     return NVPTXISD::Suld3DV2I16Zero;
4219   case Intrinsic::nvvm_suld_3d_v2i32_zero:
4220     return NVPTXISD::Suld3DV2I32Zero;
4221   case Intrinsic::nvvm_suld_3d_v2i64_zero:
4222     return NVPTXISD::Suld3DV2I64Zero;
4223   case Intrinsic::nvvm_suld_3d_v4i8_zero:
4224     return NVPTXISD::Suld3DV4I8Zero;
4225   case Intrinsic::nvvm_suld_3d_v4i16_zero:
4226     return NVPTXISD::Suld3DV4I16Zero;
4227   case Intrinsic::nvvm_suld_3d_v4i32_zero:
4228     return NVPTXISD::Suld3DV4I32Zero;
4229   }
4230 }
4231 
4232 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
4233 // TgtMemIntrinsic because we need the information that is only available in
4234 // the "Value" type of the destination pointer, in particular the address
4235 // space information.
4237 bool NVPTXTargetLowering::getTgtMemIntrinsic(
4238     IntrinsicInfo &Info, const CallInst &I,
4239     MachineFunction &MF, unsigned Intrinsic) const {
4240   switch (Intrinsic) {
4241   default:
4242     return false;
4243   case Intrinsic::nvvm_match_all_sync_i32p:
4244   case Intrinsic::nvvm_match_all_sync_i64p:
4245     Info.opc = ISD::INTRINSIC_W_CHAIN;
4246     // memVT is bogus. These intrinsics have the IntrInaccessibleMemOnly
4247     // attribute in order to model data exchange with other threads, but
4248     // perform no real memory accesses.
4249     Info.memVT = MVT::i1;
4250 
4251     // Our result depends on both our and other thread's arguments.
4252     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4253     return true;
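  // WMMA loads of f16 A/B fragments: modeled as 16-byte aligned v8f16 loads
  // from the pointer operand.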
4254   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
4255   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
4256   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
4257   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
4258   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
4259   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
4260   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
4261   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
4262   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
4263   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
4264   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
4265   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
4266   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
4267   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
4268   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
4269   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
4270   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
4271   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
4272   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
4273   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
4274   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
4275   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
4276   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
4277   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
4278     Info.opc = ISD::INTRINSIC_W_CHAIN;
4279     Info.memVT = MVT::v8f16;
4280     Info.ptrVal = I.getArgOperand(0);
4281     Info.offset = 0;
4282     Info.flags = MachineMemOperand::MOLoad;
4283     Info.align = Align(16);
4284     return true;
4285   }
4286   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
4287   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
4288   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
4289   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
4290   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
4291   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
4292   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
4293   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
4294   case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
4295   case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
4296   case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
4297   case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
4298   case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
4299   case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
4300   case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
4301   case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
4302   case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
4303   case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
4304   case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
4305   case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
4306   case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
4307   case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
4308   case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
4309   case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
4310     Info.opc = ISD::INTRINSIC_W_CHAIN;
4311     Info.memVT = MVT::v2i32;
4312     Info.ptrVal = I.getArgOperand(0);
4313     Info.offset = 0;
4314     Info.flags = MachineMemOperand::MOLoad;
4315     Info.align = Align(8);
4316     return true;
4317   }
4318 
4319   case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
4320   case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
4321   case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
4322   case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
4323   case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
4324   case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
4325   case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
4326   case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
4327   case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
4328   case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
4329   case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
4330   case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
4331   case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
4332   case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
4333   case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
4334   case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
4335 
4336   case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
4337   case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
4338   case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
4339   case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
4340   case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
4341   case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
4342   case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
4343   case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
4344   case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
4345   case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
4346   case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
4347   case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
4348   case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
4349   case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
4350   case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
4351   case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
4352   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
4353   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
4354     Info.opc = ISD::INTRINSIC_W_CHAIN;
4355     Info.memVT = MVT::v4i32;
4356     Info.ptrVal = I.getArgOperand(0);
4357     Info.offset = 0;
4358     Info.flags = MachineMemOperand::MOLoad;
4359     Info.align = Align(16);
4360     return true;
4361   }
4362 
4363   case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
4364   case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
4365   case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
4366   case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
4367   case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
4368   case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
4369   case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
4370   case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
4371 
4372   case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
4373   case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
4374   case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
4375   case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
4376   case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
4377   case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
4378   case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
4379   case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
4380   case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
4381   case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
4382   case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
4383   case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
4384   case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
4385   case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
4386   case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
4387   case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
4388   case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
4389   case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
4390   case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
4391   case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
4392   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
4393   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
4394     Info.opc = ISD::INTRINSIC_W_CHAIN;
4395     Info.memVT = MVT::i32;
4396     Info.ptrVal = I.getArgOperand(0);
4397     Info.offset = 0;
4398     Info.flags = MachineMemOperand::MOLoad;
4399     Info.align = Align(4);
4400     return true;
4401   }
4402 
4403   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
4404   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
4405   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
4406   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
4407   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
4408   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
4409   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
4410   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
4411   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
4412   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
4413   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
4414   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
4415     Info.opc = ISD::INTRINSIC_W_CHAIN;
4416     Info.memVT = MVT::v4f16;
4417     Info.ptrVal = I.getArgOperand(0);
4418     Info.offset = 0;
4419     Info.flags = MachineMemOperand::MOLoad;
4420     Info.align = Align(16);
4421     return true;
4422   }
4423 
4424   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
4425   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
4426   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
4427   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
4428   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
4429   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
4430   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
4431   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
4432   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
4433   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
4434   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
4435   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4436   case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4437   case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4438   case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4439   case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4440     Info.opc = ISD::INTRINSIC_W_CHAIN;
4441     Info.memVT = MVT::v8f32;
4442     Info.ptrVal = I.getArgOperand(0);
4443     Info.offset = 0;
4444     Info.flags = MachineMemOperand::MOLoad;
4445     Info.align = Align(16);
4446     return true;
4447   }
4448 
4449   case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4450   case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4451   case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4452   case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4453 
4454   case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4455   case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4456   case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4457   case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4458 
4459   case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4460   case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4461   case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4462   case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4463   case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4464   case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4465   case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4466   case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4467   case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4468   case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4469   case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4470   case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4471     Info.opc = ISD::INTRINSIC_W_CHAIN;
4472     Info.memVT = MVT::v8i32;
4473     Info.ptrVal = I.getArgOperand(0);
4474     Info.offset = 0;
4475     Info.flags = MachineMemOperand::MOLoad;
4476     Info.align = Align(16);
4477     return true;
4478   }
4479 
4480   case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4481   case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4482   case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4483   case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4484   case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4485   case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4486   case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4487   case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4488   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4489   case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
4490     Info.opc = ISD::INTRINSIC_W_CHAIN;
4491     Info.memVT = MVT::v2i32;
4492     Info.ptrVal = I.getArgOperand(0);
4493     Info.offset = 0;
4494     Info.flags = MachineMemOperand::MOLoad;
4495     Info.align = Align(8);
4496     return true;
4497   }
4498 
4499   case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4500   case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4501   case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4502   case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4503 
4504   case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4505   case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4506   case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4507   case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4508     Info.opc = ISD::INTRINSIC_W_CHAIN;
4509     Info.memVT = MVT::f64;
4510     Info.ptrVal = I.getArgOperand(0);
4511     Info.offset = 0;
4512     Info.flags = MachineMemOperand::MOLoad;
4513     Info.align = Align(8);
4514     return true;
4515   }
4516 
4517   case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4518   case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4519   case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4520   case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4521     Info.opc = ISD::INTRINSIC_W_CHAIN;
4522     Info.memVT = MVT::v2f64;
4523     Info.ptrVal = I.getArgOperand(0);
4524     Info.offset = 0;
4525     Info.flags = MachineMemOperand::MOLoad;
4526     Info.align = Align(16);
4527     return true;
4528   }
4529 
4530   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4531   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4532   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4533   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4534   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4535   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4536   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4537   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4538   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4539   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4540   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4541   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4542     Info.opc = ISD::INTRINSIC_VOID;
4543     Info.memVT = MVT::v4f16;
4544     Info.ptrVal = I.getArgOperand(0);
4545     Info.offset = 0;
4546     Info.flags = MachineMemOperand::MOStore;
4547     Info.align = Align(16);
4548     return true;
4549   }
4550 
4551   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4552   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4553   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4554   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4555   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4556   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4557   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4558   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4559   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4560   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4561   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4562   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4563   case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4564   case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4565   case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4566   case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4567     Info.opc = ISD::INTRINSIC_VOID;
4568     Info.memVT = MVT::v8f32;
4569     Info.ptrVal = I.getArgOperand(0);
4570     Info.offset = 0;
4571     Info.flags = MachineMemOperand::MOStore;
4572     Info.align = Align(16);
4573     return true;
4574   }
4575 
4576   case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4577   case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4578   case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4579   case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4580   case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4581   case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4582   case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4583   case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4584   case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4585   case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4586   case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4587   case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4588     Info.opc = ISD::INTRINSIC_VOID;
4589     Info.memVT = MVT::v8i32;
4590     Info.ptrVal = I.getArgOperand(0);
4591     Info.offset = 0;
4592     Info.flags = MachineMemOperand::MOStore;
4593     Info.align = Align(16);
4594     return true;
4595   }
4596 
4597   case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4598   case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4599   case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4600   case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4601   case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4602   case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4603   case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4604   case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
4605     Info.opc = ISD::INTRINSIC_VOID;
4606     Info.memVT = MVT::v2i32;
4607     Info.ptrVal = I.getArgOperand(0);
4608     Info.offset = 0;
4609     Info.flags = MachineMemOperand::MOStore;
4610     Info.align = Align(8);
4611     return true;
4612   }
4613 
4614   case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4615   case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4616   case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4617   case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4618     Info.opc = ISD::INTRINSIC_VOID;
4619     Info.memVT = MVT::v2f64;
4620     Info.ptrVal = I.getArgOperand(0);
4621     Info.offset = 0;
4622     Info.flags = MachineMemOperand::MOStore;
4623     Info.align = Align(16);
4624     return true;
4625   }
4626 
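  // Generic-address-space atomics: the memory VT is the operation's value
  // type and the access is modeled as both a load and a store.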
4627   case Intrinsic::nvvm_atomic_load_inc_32:
4628   case Intrinsic::nvvm_atomic_load_dec_32:
4629 
4630   case Intrinsic::nvvm_atomic_add_gen_f_cta:
4631   case Intrinsic::nvvm_atomic_add_gen_f_sys:
4632   case Intrinsic::nvvm_atomic_add_gen_i_cta:
4633   case Intrinsic::nvvm_atomic_add_gen_i_sys:
4634   case Intrinsic::nvvm_atomic_and_gen_i_cta:
4635   case Intrinsic::nvvm_atomic_and_gen_i_sys:
4636   case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4637   case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4638   case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4639   case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4640   case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4641   case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4642   case Intrinsic::nvvm_atomic_max_gen_i_cta:
4643   case Intrinsic::nvvm_atomic_max_gen_i_sys:
4644   case Intrinsic::nvvm_atomic_min_gen_i_cta:
4645   case Intrinsic::nvvm_atomic_min_gen_i_sys:
4646   case Intrinsic::nvvm_atomic_or_gen_i_cta:
4647   case Intrinsic::nvvm_atomic_or_gen_i_sys:
4648   case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4649   case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4650   case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4651   case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4652     auto &DL = I.getDataLayout();
4653     Info.opc = ISD::INTRINSIC_W_CHAIN;
4654     Info.memVT = getValueType(DL, I.getType());
4655     Info.ptrVal = I.getArgOperand(0);
4656     Info.offset = 0;
4657     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4658     Info.align.reset();
4659     return true;
4660   }
4661 
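  // ldu/ldg: read-only loads through the global address space. The memory VT
  // comes from the result type (or the pointer type for the _p variants) and
  // the alignment from the intrinsic's second argument.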
4662   case Intrinsic::nvvm_ldu_global_i:
4663   case Intrinsic::nvvm_ldu_global_f:
4664   case Intrinsic::nvvm_ldu_global_p: {
4665     auto &DL = I.getDataLayout();
4666     Info.opc = ISD::INTRINSIC_W_CHAIN;
4667     if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
4668       Info.memVT = getValueType(DL, I.getType());
4669     else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
4670       Info.memVT = getPointerTy(DL);
4671     else
4672       Info.memVT = getValueType(DL, I.getType());
4673     Info.ptrVal = I.getArgOperand(0);
4674     Info.offset = 0;
4675     Info.flags = MachineMemOperand::MOLoad;
4676     Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4677 
4678     return true;
4679   }
4680   case Intrinsic::nvvm_ldg_global_i:
4681   case Intrinsic::nvvm_ldg_global_f:
4682   case Intrinsic::nvvm_ldg_global_p: {
4683     auto &DL = I.getDataLayout();
4684 
4685     Info.opc = ISD::INTRINSIC_W_CHAIN;
4686     if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
4687       Info.memVT = getValueType(DL, I.getType());
4688     else if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
4689       Info.memVT = getPointerTy(DL);
4690     else
4691       Info.memVT = getValueType(DL, I.getType());
4692     Info.ptrVal = I.getArgOperand(0);
4693     Info.offset = 0;
4694     Info.flags = MachineMemOperand::MOLoad;
4695     Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4696 
4697     return true;
4698   }
4699 
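  // Texture and tld4 reads returning v4f32: modeled as 16-byte vector loads
  // with no associated pointer operand.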
4700   case Intrinsic::nvvm_tex_1d_v4f32_s32:
4701   case Intrinsic::nvvm_tex_1d_v4f32_f32:
4702   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4703   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4704   case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4705   case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4706   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4707   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4708   case Intrinsic::nvvm_tex_2d_v4f32_s32:
4709   case Intrinsic::nvvm_tex_2d_v4f32_f32:
4710   case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4711   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4712   case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4713   case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4714   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4715   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4716   case Intrinsic::nvvm_tex_3d_v4f32_s32:
4717   case Intrinsic::nvvm_tex_3d_v4f32_f32:
4718   case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4719   case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4720   case Intrinsic::nvvm_tex_cube_v4f32_f32:
4721   case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4722   case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4723   case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4724   case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4725   case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4726   case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4727   case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4728   case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4729   case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4730   case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4731   case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4732   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4733   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4734   case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4735   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4736   case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4737   case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4738   case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4739   case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4740   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4741   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4742   case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4743   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4744   case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4745   case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4746   case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4747   case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4748   case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4749   case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4750   case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4751   case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4752   case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4753   case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4754   case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4755   case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4756   case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4757   case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4758     Info.opc = getOpcForTextureInstr(Intrinsic);
4759     Info.memVT = MVT::v4f32;
4760     Info.ptrVal = nullptr;
4761     Info.offset = 0;
4762     Info.flags = MachineMemOperand::MOLoad;
4763     Info.align = Align(16);
4764     return true;
4765 
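  // Texture and tld4 reads returning v4s32 or v4u32: modeled as v4i32 loads.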
4766   case Intrinsic::nvvm_tex_1d_v4s32_s32:
4767   case Intrinsic::nvvm_tex_1d_v4s32_f32:
4768   case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4769   case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4770   case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4771   case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4772   case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4773   case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4774   case Intrinsic::nvvm_tex_2d_v4s32_s32:
4775   case Intrinsic::nvvm_tex_2d_v4s32_f32:
4776   case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4777   case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4778   case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4779   case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4780   case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4781   case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4782   case Intrinsic::nvvm_tex_3d_v4s32_s32:
4783   case Intrinsic::nvvm_tex_3d_v4s32_f32:
4784   case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4785   case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4786   case Intrinsic::nvvm_tex_cube_v4s32_f32:
4787   case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4788   case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4789   case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4790   case Intrinsic::nvvm_tex_cube_v4u32_f32:
4791   case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4792   case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4793   case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4794   case Intrinsic::nvvm_tex_1d_v4u32_s32:
4795   case Intrinsic::nvvm_tex_1d_v4u32_f32:
4796   case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4797   case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4798   case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4799   case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4800   case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4801   case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4802   case Intrinsic::nvvm_tex_2d_v4u32_s32:
4803   case Intrinsic::nvvm_tex_2d_v4u32_f32:
4804   case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4805   case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4806   case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4807   case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4808   case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4809   case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4810   case Intrinsic::nvvm_tex_3d_v4u32_s32:
4811   case Intrinsic::nvvm_tex_3d_v4u32_f32:
4812   case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4813   case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4814   case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4815   case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4816   case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4817   case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4818   case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4819   case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4820   case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4821   case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4822   case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4823   case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4824   case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4825   case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4826   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4827   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4828   case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4829   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4830   case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4831   case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4832   case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4833   case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4834   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4835   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4836   case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4837   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4838   case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4839   case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4840   case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4841   case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4842   case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4843   case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4844   case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4845   case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4846   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4847   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4848   case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4849   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4850   case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4851   case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4852   case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4853   case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4854   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4855   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4856   case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4857   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4858   case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4859   case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4860   case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4861   case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4862   case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4863   case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4864   case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4865   case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4866   case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4867   case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4868   case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4869   case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4870   case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4871   case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4872   case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4873   case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4874   case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4875   case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4876   case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4877   case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4878   case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4879   case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4880   case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4881   case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4882     Info.opc = getOpcForTextureInstr(Intrinsic);
4883     Info.memVT = MVT::v4i32;
4884     Info.ptrVal = nullptr;
4885     Info.offset = 0;
4886     Info.flags = MachineMemOperand::MOLoad;
4887     Info.align = Align(16);
4888     return true;
4889 
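  // Surface loads of 8-bit elements; note that memVT is the element type
  // (i8) even for the v2/v4 forms. The i16, i32, and i64 groups below follow
  // the same pattern.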
4890   case Intrinsic::nvvm_suld_1d_i8_clamp:
4891   case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4892   case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4893   case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4894   case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4895   case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4896   case Intrinsic::nvvm_suld_2d_i8_clamp:
4897   case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4898   case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4899   case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4900   case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4901   case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4902   case Intrinsic::nvvm_suld_3d_i8_clamp:
4903   case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4904   case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4905   case Intrinsic::nvvm_suld_1d_i8_trap:
4906   case Intrinsic::nvvm_suld_1d_v2i8_trap:
4907   case Intrinsic::nvvm_suld_1d_v4i8_trap:
4908   case Intrinsic::nvvm_suld_1d_array_i8_trap:
4909   case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4910   case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4911   case Intrinsic::nvvm_suld_2d_i8_trap:
4912   case Intrinsic::nvvm_suld_2d_v2i8_trap:
4913   case Intrinsic::nvvm_suld_2d_v4i8_trap:
4914   case Intrinsic::nvvm_suld_2d_array_i8_trap:
4915   case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4916   case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4917   case Intrinsic::nvvm_suld_3d_i8_trap:
4918   case Intrinsic::nvvm_suld_3d_v2i8_trap:
4919   case Intrinsic::nvvm_suld_3d_v4i8_trap:
4920   case Intrinsic::nvvm_suld_1d_i8_zero:
4921   case Intrinsic::nvvm_suld_1d_v2i8_zero:
4922   case Intrinsic::nvvm_suld_1d_v4i8_zero:
4923   case Intrinsic::nvvm_suld_1d_array_i8_zero:
4924   case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4925   case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4926   case Intrinsic::nvvm_suld_2d_i8_zero:
4927   case Intrinsic::nvvm_suld_2d_v2i8_zero:
4928   case Intrinsic::nvvm_suld_2d_v4i8_zero:
4929   case Intrinsic::nvvm_suld_2d_array_i8_zero:
4930   case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4931   case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4932   case Intrinsic::nvvm_suld_3d_i8_zero:
4933   case Intrinsic::nvvm_suld_3d_v2i8_zero:
4934   case Intrinsic::nvvm_suld_3d_v4i8_zero:
4935     Info.opc = getOpcForSurfaceInstr(Intrinsic);
4936     Info.memVT = MVT::i8;
4937     Info.ptrVal = nullptr;
4938     Info.offset = 0;
4939     Info.flags = MachineMemOperand::MOLoad;
4940     Info.align = Align(16);
4941     return true;
4942 
4943   case Intrinsic::nvvm_suld_1d_i16_clamp:
4944   case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4945   case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4946   case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4947   case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4948   case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4949   case Intrinsic::nvvm_suld_2d_i16_clamp:
4950   case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4951   case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4952   case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4953   case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4954   case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4955   case Intrinsic::nvvm_suld_3d_i16_clamp:
4956   case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4957   case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4958   case Intrinsic::nvvm_suld_1d_i16_trap:
4959   case Intrinsic::nvvm_suld_1d_v2i16_trap:
4960   case Intrinsic::nvvm_suld_1d_v4i16_trap:
4961   case Intrinsic::nvvm_suld_1d_array_i16_trap:
4962   case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4963   case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4964   case Intrinsic::nvvm_suld_2d_i16_trap:
4965   case Intrinsic::nvvm_suld_2d_v2i16_trap:
4966   case Intrinsic::nvvm_suld_2d_v4i16_trap:
4967   case Intrinsic::nvvm_suld_2d_array_i16_trap:
4968   case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4969   case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4970   case Intrinsic::nvvm_suld_3d_i16_trap:
4971   case Intrinsic::nvvm_suld_3d_v2i16_trap:
4972   case Intrinsic::nvvm_suld_3d_v4i16_trap:
4973   case Intrinsic::nvvm_suld_1d_i16_zero:
4974   case Intrinsic::nvvm_suld_1d_v2i16_zero:
4975   case Intrinsic::nvvm_suld_1d_v4i16_zero:
4976   case Intrinsic::nvvm_suld_1d_array_i16_zero:
4977   case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4978   case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4979   case Intrinsic::nvvm_suld_2d_i16_zero:
4980   case Intrinsic::nvvm_suld_2d_v2i16_zero:
4981   case Intrinsic::nvvm_suld_2d_v4i16_zero:
4982   case Intrinsic::nvvm_suld_2d_array_i16_zero:
4983   case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4984   case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4985   case Intrinsic::nvvm_suld_3d_i16_zero:
4986   case Intrinsic::nvvm_suld_3d_v2i16_zero:
4987   case Intrinsic::nvvm_suld_3d_v4i16_zero:
4988     Info.opc = getOpcForSurfaceInstr(Intrinsic);
4989     Info.memVT = MVT::i16;
4990     Info.ptrVal = nullptr;
4991     Info.offset = 0;
4992     Info.flags = MachineMemOperand::MOLoad;
4993     Info.align = Align(16);
4994     return true;
4995 
4996   case Intrinsic::nvvm_suld_1d_i32_clamp:
4997   case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4998   case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4999   case Intrinsic::nvvm_suld_1d_array_i32_clamp:
5000   case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
5001   case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
5002   case Intrinsic::nvvm_suld_2d_i32_clamp:
5003   case Intrinsic::nvvm_suld_2d_v2i32_clamp:
5004   case Intrinsic::nvvm_suld_2d_v4i32_clamp:
5005   case Intrinsic::nvvm_suld_2d_array_i32_clamp:
5006   case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
5007   case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
5008   case Intrinsic::nvvm_suld_3d_i32_clamp:
5009   case Intrinsic::nvvm_suld_3d_v2i32_clamp:
5010   case Intrinsic::nvvm_suld_3d_v4i32_clamp:
5011   case Intrinsic::nvvm_suld_1d_i32_trap:
5012   case Intrinsic::nvvm_suld_1d_v2i32_trap:
5013   case Intrinsic::nvvm_suld_1d_v4i32_trap:
5014   case Intrinsic::nvvm_suld_1d_array_i32_trap:
5015   case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
5016   case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
5017   case Intrinsic::nvvm_suld_2d_i32_trap:
5018   case Intrinsic::nvvm_suld_2d_v2i32_trap:
5019   case Intrinsic::nvvm_suld_2d_v4i32_trap:
5020   case Intrinsic::nvvm_suld_2d_array_i32_trap:
5021   case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
5022   case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
5023   case Intrinsic::nvvm_suld_3d_i32_trap:
5024   case Intrinsic::nvvm_suld_3d_v2i32_trap:
5025   case Intrinsic::nvvm_suld_3d_v4i32_trap:
5026   case Intrinsic::nvvm_suld_1d_i32_zero:
5027   case Intrinsic::nvvm_suld_1d_v2i32_zero:
5028   case Intrinsic::nvvm_suld_1d_v4i32_zero:
5029   case Intrinsic::nvvm_suld_1d_array_i32_zero:
5030   case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
5031   case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
5032   case Intrinsic::nvvm_suld_2d_i32_zero:
5033   case Intrinsic::nvvm_suld_2d_v2i32_zero:
5034   case Intrinsic::nvvm_suld_2d_v4i32_zero:
5035   case Intrinsic::nvvm_suld_2d_array_i32_zero:
5036   case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
5037   case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
5038   case Intrinsic::nvvm_suld_3d_i32_zero:
5039   case Intrinsic::nvvm_suld_3d_v2i32_zero:
5040   case Intrinsic::nvvm_suld_3d_v4i32_zero:
5041     Info.opc = getOpcForSurfaceInstr(Intrinsic);
5042     Info.memVT = MVT::i32;
5043     Info.ptrVal = nullptr;
5044     Info.offset = 0;
5045     Info.flags = MachineMemOperand::MOLoad;
5046     Info.align = Align(16);
5047     return true;
5048 
5049   case Intrinsic::nvvm_suld_1d_i64_clamp:
5050   case Intrinsic::nvvm_suld_1d_v2i64_clamp:
5051   case Intrinsic::nvvm_suld_1d_array_i64_clamp:
5052   case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
5053   case Intrinsic::nvvm_suld_2d_i64_clamp:
5054   case Intrinsic::nvvm_suld_2d_v2i64_clamp:
5055   case Intrinsic::nvvm_suld_2d_array_i64_clamp:
5056   case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
5057   case Intrinsic::nvvm_suld_3d_i64_clamp:
5058   case Intrinsic::nvvm_suld_3d_v2i64_clamp:
5059   case Intrinsic::nvvm_suld_1d_i64_trap:
5060   case Intrinsic::nvvm_suld_1d_v2i64_trap:
5061   case Intrinsic::nvvm_suld_1d_array_i64_trap:
5062   case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
5063   case Intrinsic::nvvm_suld_2d_i64_trap:
5064   case Intrinsic::nvvm_suld_2d_v2i64_trap:
5065   case Intrinsic::nvvm_suld_2d_array_i64_trap:
5066   case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
5067   case Intrinsic::nvvm_suld_3d_i64_trap:
5068   case Intrinsic::nvvm_suld_3d_v2i64_trap:
5069   case Intrinsic::nvvm_suld_1d_i64_zero:
5070   case Intrinsic::nvvm_suld_1d_v2i64_zero:
5071   case Intrinsic::nvvm_suld_1d_array_i64_zero:
5072   case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
5073   case Intrinsic::nvvm_suld_2d_i64_zero:
5074   case Intrinsic::nvvm_suld_2d_v2i64_zero:
5075   case Intrinsic::nvvm_suld_2d_array_i64_zero:
5076   case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
5077   case Intrinsic::nvvm_suld_3d_i64_zero:
5078   case Intrinsic::nvvm_suld_3d_v2i64_zero:
5079     Info.opc = getOpcForSurfaceInstr(Intrinsic);
5080     Info.memVT = MVT::i64;
5081     Info.ptrVal = nullptr;
5082     Info.offset = 0;
5083     Info.flags = MachineMemOperand::MOLoad;
5084     Info.align = Align(16);
5085     return true;
5086   }
5087   return false;
5088 }
5089 
5090 /// getFunctionParamOptimizedAlign - Since function arguments are passed via
5091 /// .param space, we may want to increase their alignment in a way that
5092 /// ensures we can effectively vectorize their loads and stores. We can
5093 /// increase the alignment only if the function has internal or private
5094 /// linkage, because for other linkage types callers may already rely on the
5095 /// default alignment. To allow 128-bit vectorized loads/stores, this function
5096 /// ensures that the alignment is 16 or greater.
5097 Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
5098     const Function *F, Type *ArgTy, const DataLayout &DL) const {
5099   // Cap the alignment at 128 bytes, as that is the maximum alignment
5100   // supported by PTX.
5101   const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));
5102 
5103   // If a function has linkage other than internal or private, we must use
5104   // the default ABI alignment, because external users rely on it. The same
5105   // applies to a function that may be called through a function pointer.
5106   if (!F || !F->hasLocalLinkage() ||
5107       F->hasAddressTaken(/*Users=*/nullptr,
5108                          /*IgnoreCallbackUses=*/false,
5109                          /*IgnoreAssumeLikeCalls=*/true,
5110                          /*IgnoreLLVMUsed=*/true))
5111     return ABITypeAlign;
5112 
5113   assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
5114   return std::max(Align(16), ABITypeAlign);
5115 }
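
// Illustrative example (not part of the original source; a sketch of the logic
// above): for a local-linkage device function whose address is not taken, a
// parameter whose ABI type alignment is only 1 or 4 bytes is bumped here to
// max(Align(16), ABITypeAlign), allowing 128-bit vectorized .param accesses
// such as ld.param.v4.b32. For any other linkage the plain ABI alignment is
// returned unchanged.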
5116 
5117 /// Helper for computing alignment of a device function byval parameter.
5118 Align NVPTXTargetLowering::getFunctionByValParamAlign(
5119     const Function *F, Type *ArgTy, Align InitialAlign,
5120     const DataLayout &DL) const {
5121   Align ArgAlign = InitialAlign;
5122   // Try to increase alignment to enhance vectorization options.
5123   if (F)
5124     ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
5125 
5126   // Old ptxas versions have a bug: when PTX code takes the address of a
5127   // byval parameter with alignment < 4, ptxas generates code to spill
5128   // the argument into memory. Unfortunately, on sm_50+ ptxas generates
5129   // SASS code that fails with a misaligned access. To work around the
5130   // problem, make sure that byval parameters are aligned to at least 4.
5131   // This bug appears to be fixed starting from ptxas > 9.0.
5132   // TODO: remove this after verifying that the bug does not reproduce
5133   // on non-deprecated ptxas versions.
5135   if (ForceMinByValParamAlign)
5136     ArgAlign = std::max(ArgAlign, Align(4));
5137 
5138   return ArgAlign;
5139 }
5140 
5141 // Helper for getting a function parameter name. The name is composed of the
5142 // function name and the parameter's index. A negative index corresponds to
5143 // the special parameter (an unsized array) used for passing variable arguments.
5144 std::string NVPTXTargetLowering::getParamName(const Function *F,
5145                                               int Idx) const {
5146   std::string ParamName;
5147   raw_string_ostream ParamStr(ParamName);
5148 
5149   ParamStr << getTargetMachine().getSymbol(F)->getName();
5150   if (Idx < 0)
5151     ParamStr << "_vararg";
5152   else
5153     ParamStr << "_param_" << Idx;
5154 
5155   return ParamName;
5156 }
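
// Illustrative example (derived from the code above, not a comment from the
// original source): for a function whose symbol is "foo", getParamName returns
// "foo_param_0" for Idx == 0 and "foo_vararg" for a negative Idx, which is
// then used as the corresponding .param symbol name.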
5157 
5158 /// isLegalAddressingMode - Return true if the addressing mode represented
5159 /// by AM is legal for this target, for a load/store of the specified type.
5160 /// Used to guide target specific optimizations, like loop strength reduction
5161 /// (LoopStrengthReduce.cpp) and memory optimization for address mode
5162 /// (CodeGenPrepare.cpp)
5163 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
5164                                                 const AddrMode &AM, Type *Ty,
5165                                                 unsigned AS, Instruction *I) const {
5166   // AddrMode - This represents an addressing mode of:
5167   //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
5168   //
5169   // The legal address modes are
5170   // - [avar]
5171   // - [areg]
5172   // - [areg+immoff]
5173   // - [immAddr]
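  //
  // For illustration (a hedged sketch; exact forms depend on instruction
  // selection):
  //   [globalvar]  -> [avar]
  //   [%r1]        -> [areg]
  //   [%r1+8]      -> [areg+immoff]
  //   [42]         -> [immAddr]
  // A reg+reg form or a scaled index such as base + 4*idx is rejected below.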
5174 
5175   // immoff must fit in a signed 32-bit int
5176   if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
5177     return false;
5178 
5179   if (AM.BaseGV)
5180     return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
5181 
5182   switch (AM.Scale) {
5183   case 0: // "r", "r+i" or "i" is allowed
5184     break;
5185   case 1:
5186     if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
5187       return false;
5188     // Otherwise we have r+i.
5189     break;
5190   default:
5191     // No scale > 1 is allowed
5192     return false;
5193   }
5194   return true;
5195 }
5196 
5197 //===----------------------------------------------------------------------===//
5198 //                         NVPTX Inline Assembly Support
5199 //===----------------------------------------------------------------------===//
5200 
5201 /// getConstraintType - Given a constraint letter, return the type of
5202 /// constraint it is for this target.
5203 NVPTXTargetLowering::ConstraintType
5204 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
5205   if (Constraint.size() == 1) {
5206     switch (Constraint[0]) {
5207     default:
5208       break;
5209     case 'b':
5210     case 'r':
5211     case 'h':
5212     case 'c':
5213     case 'l':
5214     case 'f':
5215     case 'd':
5216     case 'q':
5217     case '0':
5218     case 'N':
5219       return C_RegisterClass;
5220     }
5221   }
5222   return TargetLowering::getConstraintType(Constraint);
5223 }
5224 
5225 std::pair<unsigned, const TargetRegisterClass *>
5226 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
5227                                                   StringRef Constraint,
5228                                                   MVT VT) const {
5229   if (Constraint.size() == 1) {
5230     switch (Constraint[0]) {
5231     case 'b':
5232       return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
5233     case 'c':
5234       return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5235     case 'h':
5236       return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5237     case 'r':
5238       return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
5239     case 'l':
5240     case 'N':
5241       return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
5242     case 'q': {
5243       if (STI.getSmVersion() < 70)
5244         report_fatal_error("Inline asm with 128 bit operands is only "
5245                            "supported for sm_70 and higher!");
5246       return std::make_pair(0U, &NVPTX::Int128RegsRegClass);
5247     }
5248     case 'f':
5249       return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
5250     case 'd':
5251       return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
5252     }
5253   }
5254   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5255 }
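
// Illustrative usage (a hedged sketch, not from this file): in CUDA device
// code these constraint letters appear in inline PTX asm, e.g.
//   asm("add.s32 %0, %1, %2;" : "=r"(out) : "r"(a), "r"(b));
// where "r" selects a 32-bit integer register, "l" a 64-bit one, "h" a 16-bit
// one, "f" an f32 register, and "d" an f64 register, matching the register
// classes returned above.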
5256 
5257 //===----------------------------------------------------------------------===//
5258 //                         NVPTX DAG Combining
5259 //===----------------------------------------------------------------------===//
5260 
5261 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
5262                                    CodeGenOptLevel OptLevel) const {
5263   // Always honor command-line argument
5264   if (FMAContractLevelOpt.getNumOccurrences() > 0)
5265     return FMAContractLevelOpt > 0;
5266 
5267   // Do not contract if we're not optimizing the code.
5268   if (OptLevel == CodeGenOptLevel::None)
5269     return false;
5270 
5271   // Honor TargetOptions flags that explicitly say fusion is okay.
5272   if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
5273     return true;
5274 
5275   return allowUnsafeFPMath(MF);
5276 }
5277 
5278 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
5279   // Honor TargetOptions flags that explicitly say unsafe math is okay.
5280   if (MF.getTarget().Options.UnsafeFPMath)
5281     return true;
5282 
5283   // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
5284   const Function &F = MF.getFunction();
5285   return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
5286 }
5287 
5288 static bool isConstZero(const SDValue &Operand) {
5289   const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5290   return Const && Const->getZExtValue() == 0;
5291 }
5292 
5293 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5294 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
5295 /// called with the default operands, and if that fails, with commuted
5296 /// operands.
5297 static SDValue
5298 PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5299                               TargetLowering::DAGCombinerInfo &DCI) {
5300   EVT VT = N0.getValueType();
5301 
5302   // Since an integer multiply-add costs the same as an integer multiply
5303   // but is more costly than an integer add, do the fusion only when the
5304   // mul's single use is the add.
5305   // TODO: this may not be true for later architectures; consider relaxing this.
5306   if (!N0.getNode()->hasOneUse())
5307     return SDValue();
5308 
5309   // fold (add (mul a, b), c) -> (mad a, b, c)
5310   //
5311   if (N0.getOpcode() == ISD::MUL)
5312     return DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, N0.getOperand(0),
5313                            N0.getOperand(1), N1);
5314 
5315   // fold (add (select cond, 0, (mul a, b)), c)
5316   //   -> (select cond, c, (mad a, b, c))
5317   //
5318   if (N0.getOpcode() == ISD::SELECT) {
5319     unsigned ZeroOpNum;
5320     if (isConstZero(N0->getOperand(1)))
5321       ZeroOpNum = 1;
5322     else if (isConstZero(N0->getOperand(2)))
5323       ZeroOpNum = 2;
5324     else
5325       return SDValue();
5326 
5327     SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
5328     if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
5329       return SDValue();
5330 
5331     SDValue MAD = DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
5332                                   M->getOperand(0), M->getOperand(1), N1);
5333     return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
5334                              ((ZeroOpNum == 1) ? N1 : MAD),
5335                              ((ZeroOpNum == 1) ? MAD : N1));
5336   }
5337 
5338   return SDValue();
5339 }
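
// Illustrative example (a sketch, not part of the original source): given
//   t = (mul i32 %a, %b)   ; t has a single use
//   r = (add i32 t, %c)
// the combine above produces a single NVPTXISD::IMAD node, which selects to
// PTX along the lines of "mad.lo.s32 %r, %ra, %rb, %rc" instead of separate
// mul and add instructions.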
5340 
5341 static SDValue
5342 PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5343                                TargetLowering::DAGCombinerInfo &DCI,
5344                                CodeGenOptLevel OptLevel) {
5345   EVT VT = N0.getValueType();
5346   if (N0.getOpcode() == ISD::FMUL) {
5347     const auto *TLI = static_cast<const NVPTXTargetLowering *>(
5348         &DCI.DAG.getTargetLoweringInfo());
5349     if (!TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel))
5350       return SDValue();
5351 
5352     // For floating point:
5353     // Do the fusion only when the mul has fewer than 5 uses and all of
5354     // them are adds.
5355     // The heuristic is that if a use is not an add, then that use cannot
5356     // be fused into an fma, so the mul is still needed anyway.
5357     // If there are more than 4 uses, even if they are all adds, fusing
5358     // them will increase register pressure.
5359     //
5360     int numUses = 0;
5361     int nonAddCount = 0;
5362     for (const SDNode *User : N0.getNode()->uses()) {
5363       numUses++;
5364       if (User->getOpcode() != ISD::FADD)
5365         ++nonAddCount;
5366       if (numUses >= 5)
5367         return SDValue();
5368     }
5369     if (nonAddCount) {
5370       int orderNo = N->getIROrder();
5371       int orderNo2 = N0.getNode()->getIROrder();
5372       // Simple heuristic for estimating potential register pressure: the
5373       // difference in IR order measures the distance between the def and
5374       // its use, and the longer the distance, the more likely it is to
5375       // cause register pressure.
5376       if (orderNo - orderNo2 < 500)
5377         return SDValue();
5378 
5379       // Now, check if at least one of the FMUL's operands is live beyond the
5380       // node N, which guarantees that the FMA will not increase register
5381       // pressure at node N.
5382       bool opIsLive = false;
5383       const SDNode *left = N0.getOperand(0).getNode();
5384       const SDNode *right = N0.getOperand(1).getNode();
5385 
5386       if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5387         opIsLive = true;
5388 
5389       if (!opIsLive)
5390         for (const SDNode *User : left->uses()) {
5391           int orderNo3 = User->getIROrder();
5392           if (orderNo3 > orderNo) {
5393             opIsLive = true;
5394             break;
5395           }
5396         }
5397 
5398       if (!opIsLive)
5399         for (const SDNode *User : right->uses()) {
5400           int orderNo3 = User->getIROrder();
5401           if (orderNo3 > orderNo) {
5402             opIsLive = true;
5403             break;
5404           }
5405         }
5406 
5407       if (!opIsLive)
5408         return SDValue();
5409     }
5410 
5411     return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
5412                            N0.getOperand(1), N1);
5413   }
5414 
5415   return SDValue();
5416 }
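
// Illustrative example (hedged): when contraction is allowed, the code above
// rewrites (fadd (fmul a, b), c) as ISD::FMA, which selects to a fused
// multiply-add such as fma.rn.f32 or fma.rn.f64, provided the fmul's uses
// pass the register-pressure heuristics.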
5417 
5418 static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front,
5419                                          std::size_t Back) {
5420   if (all_of(N->ops().drop_front(Front).drop_back(Back),
5421              [](const SDUse &U) { return U.get()->isUndef(); }))
5422     // Operand 0 is the previous value in the chain. Cannot return EntryToken
5423     // as the previous value will become unused and eliminated later.
5424     return N->getOperand(0);
5425 
5426   return SDValue();
5427 }
5428 
5429 static SDValue PerformStoreParamCombine(SDNode *N) {
5430   // Operands from the 3rd to the 2nd last one are the values to be stored.
5431   //   {Chain, ArgID, Offset, Val, Glue}
5432   return PerformStoreCombineHelper(N, 3, 1);
5433 }
5434 
5435 static SDValue PerformStoreRetvalCombine(SDNode *N) {
5436   // Operands from the 2nd to the last one are the values to be stored
5437   return PerformStoreCombineHelper(N, 2, 0);
5438 }
5439 
5440 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5441 ///
5442 static SDValue PerformADDCombine(SDNode *N,
5443                                  TargetLowering::DAGCombinerInfo &DCI,
5444                                  CodeGenOptLevel OptLevel) {
5445   if (OptLevel == CodeGenOptLevel::None)
5446     return SDValue();
5447 
5448   SDValue N0 = N->getOperand(0);
5449   SDValue N1 = N->getOperand(1);
5450 
5451   // Skip non-integer, non-scalar case
5452   EVT VT = N0.getValueType();
5453   if (VT.isVector() || VT != MVT::i32)
5454     return SDValue();
5455 
5456   // First try with the default operand order.
5457   if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
5458     return Result;
5459 
5460   // If that didn't work, try again with the operands commuted.
5461   return PerformADDCombineWithOperands(N, N1, N0, DCI);
5462 }
5463 
5464 /// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
5465 ///
5466 static SDValue PerformFADDCombine(SDNode *N,
5467                                  TargetLowering::DAGCombinerInfo &DCI,
5468                                  CodeGenOptLevel OptLevel) {
5469   SDValue N0 = N->getOperand(0);
5470   SDValue N1 = N->getOperand(1);
5471 
5472   EVT VT = N0.getValueType();
5473   if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
5474     return SDValue();
5475 
5476   // First try with the default operand order.
5477   if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
5478     return Result;
5479 
5480   // If that didn't work, try again with the operands commuted.
5481   return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
5482 }
5483 
5484 static SDValue PerformANDCombine(SDNode *N,
5485                                  TargetLowering::DAGCombinerInfo &DCI) {
5486   // The type legalizer turns a vector load of i8 values into a zextload to i16
5487   // registers, optionally ANY_EXTENDs it (if target type is integer),
5488   // and ANDs off the high 8 bits. Since we turn this load into a
5489   // target-specific DAG node, the DAG combiner fails to eliminate these AND
5490   // nodes. Do that here.
5491   SDValue Val = N->getOperand(0);
5492   SDValue Mask = N->getOperand(1);
5493 
5494   if (isa<ConstantSDNode>(Val)) {
5495     std::swap(Val, Mask);
5496   }
5497 
5498   SDValue AExt;
5499 
5500   // Convert BFE -> truncate i16 -> and 255
5501   // to just BFE -> truncate i16, as the value already has all the bits in
5502   // the right places.
5503   if (Val.getOpcode() == ISD::TRUNCATE) {
5504     SDValue BFE = Val.getOperand(0);
5505     if (BFE.getOpcode() != NVPTXISD::BFE)
5506       return SDValue();
5507 
5508     ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
5509     if (!BFEBits)
5510       return SDValue();
5511     uint64_t BFEBitsVal = BFEBits->getZExtValue();
5512 
5513     ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
5514     if (!MaskCnst) {
5515       // Not an AND with a constant
5516       return SDValue();
5517     }
5518     uint64_t MaskVal = MaskCnst->getZExtValue();
5519 
5520     if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
5521       return SDValue();
5522     // If we get here, the AND is unnecessary.  Just replace it with the trunc
5523     DCI.CombineTo(N, Val, false);
5524   }
5525   // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
5526   if (Val.getOpcode() == ISD::ANY_EXTEND) {
5527     AExt = Val;
5528     Val = Val->getOperand(0);
5529   }
5530 
5531   if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
5532     Val = Val->getOperand(0);
5533   }
5534 
5535   if (Val->getOpcode() == NVPTXISD::LoadV2 ||
5536       Val->getOpcode() == NVPTXISD::LoadV4) {
5537     ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
5538     if (!MaskCnst) {
5539       // Not an AND with a constant
5540       return SDValue();
5541     }
5542 
5543     uint64_t MaskVal = MaskCnst->getZExtValue();
5544     if (MaskVal != 0xff) {
5545       // Not an AND that chops off top 8 bits
5546       return SDValue();
5547     }
5548 
5549     MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
5550     if (!Mem) {
5551       // Not a MemSDNode?!?
5552       return SDValue();
5553     }
5554 
5555     EVT MemVT = Mem->getMemoryVT();
5556     if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
5557       // We only handle the i8 case
5558       return SDValue();
5559     }
5560 
5561     unsigned ExtType = Val->getConstantOperandVal(Val->getNumOperands() - 1);
5562     if (ExtType == ISD::SEXTLOAD) {
5563       // If for some reason the load is a sextload, the and is needed to zero
5564       // out the high 8 bits
5565       return SDValue();
5566     }
5567 
5568     bool AddTo = false;
5569     if (AExt.getNode() != nullptr) {
5570       // Re-insert the ext as a zext.
5571       Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
5572                             AExt.getValueType(), Val);
5573       AddTo = true;
5574     }
5575 
5576     // If we get here, the AND is unnecessary.  Just replace it with the load
5577     DCI.CombineTo(N, Val, AddTo);
5578   }
5579 
5580   return SDValue();
5581 }
5582 
5583 static SDValue PerformREMCombine(SDNode *N,
5584                                  TargetLowering::DAGCombinerInfo &DCI,
5585                                  CodeGenOptLevel OptLevel) {
5586   assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
5587 
5588   // Don't do anything at less than -O2.
5589   if (OptLevel < CodeGenOptLevel::Default)
5590     return SDValue();
5591 
5592   SelectionDAG &DAG = DCI.DAG;
5593   SDLoc DL(N);
5594   EVT VT = N->getValueType(0);
5595   bool IsSigned = N->getOpcode() == ISD::SREM;
5596   unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
5597 
5598   const SDValue &Num = N->getOperand(0);
5599   const SDValue &Den = N->getOperand(1);
5600 
5601   for (const SDNode *U : Num->uses()) {
5602     if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
5603         U->getOperand(1) == Den) {
5604       // Num % Den -> Num - (Num / Den) * Den
5605       return DAG.getNode(ISD::SUB, DL, VT, Num,
5606                          DAG.getNode(ISD::MUL, DL, VT,
5607                                      DAG.getNode(DivOpc, DL, VT, Num, Den),
5608                                      Den));
5609     }
5610   }
5611   return SDValue();
5612 }
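
// Illustrative example (hedged): if the function already computes
//   %q = udiv i32 %n, %d
// then "%r = urem i32 %n, %d" is rewritten above as %n - ((%n / %d) * %d);
// the newly created udiv is identical to %q and is CSE'd by the DAG, so a
// single division serves both the quotient and the remainder.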
5613 
5614 enum OperandSignedness {
5615   Signed = 0,
5616   Unsigned,
5617   Unknown
5618 };
5619 
5620 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5621 /// that can be demoted to \p OptSize bits without loss of information. The
5622 /// signedness of the operand, if determinable, is placed in \p S.
5623 static bool IsMulWideOperandDemotable(SDValue Op,
5624                                       unsigned OptSize,
5625                                       OperandSignedness &S) {
5626   S = Unknown;
5627 
5628   if (Op.getOpcode() == ISD::SIGN_EXTEND ||
5629       Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
5630     EVT OrigVT = Op.getOperand(0).getValueType();
5631     if (OrigVT.getFixedSizeInBits() <= OptSize) {
5632       S = Signed;
5633       return true;
5634     }
5635   } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
5636     EVT OrigVT = Op.getOperand(0).getValueType();
5637     if (OrigVT.getFixedSizeInBits() <= OptSize) {
5638       S = Unsigned;
5639       return true;
5640     }
5641   }
5642 
5643   return false;
5644 }
5645 
5646 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
5647 /// be demoted to \p OptSize bits without loss of information. If the operands
5648 /// contain a constant, it should appear as the RHS operand. The signedness of
5649 /// the operands is placed in \p IsSigned.
5650 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
5651                                         unsigned OptSize,
5652                                         bool &IsSigned) {
5653   OperandSignedness LHSSign;
5654 
5655   // The LHS operand must be a demotable op
5656   if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
5657     return false;
5658 
5659   // We should have been able to determine the signedness from the LHS
5660   if (LHSSign == Unknown)
5661     return false;
5662 
5663   IsSigned = (LHSSign == Signed);
5664 
5665   // The RHS can be a demotable op or a constant
5666   if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
5667     const APInt &Val = CI->getAPIntValue();
5668     if (LHSSign == Unsigned) {
5669       return Val.isIntN(OptSize);
5670     } else {
5671       return Val.isSignedIntN(OptSize);
5672     }
5673   } else {
5674     OperandSignedness RHSSign;
5675     if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
5676       return false;
5677 
5678     return LHSSign == RHSSign;
5679   }
5680 }
5681 
5682 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
5683 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
5684 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
5685 /// amount.
5686 static SDValue TryMULWIDECombine(SDNode *N,
5687                                  TargetLowering::DAGCombinerInfo &DCI) {
5688   EVT MulType = N->getValueType(0);
5689   if (MulType != MVT::i32 && MulType != MVT::i64) {
5690     return SDValue();
5691   }
5692 
5693   SDLoc DL(N);
5694   unsigned OptSize = MulType.getSizeInBits() >> 1;
5695   SDValue LHS = N->getOperand(0);
5696   SDValue RHS = N->getOperand(1);
5697 
5698   // Canonicalize the multiply so the constant (if any) is on the right
5699   if (N->getOpcode() == ISD::MUL) {
5700     if (isa<ConstantSDNode>(LHS)) {
5701       std::swap(LHS, RHS);
5702     }
5703   }
5704 
5705   // If we have a SHL, determine the actual multiply amount
5706   if (N->getOpcode() == ISD::SHL) {
5707     ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
5708     if (!ShlRHS) {
5709       return SDValue();
5710     }
5711 
5712     APInt ShiftAmt = ShlRHS->getAPIntValue();
5713     unsigned BitWidth = MulType.getSizeInBits();
5714     if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
5715       APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
5716       RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
5717     } else {
5718       return SDValue();
5719     }
5720   }
5721 
5722   bool Signed;
5723   // Verify that our operands are demotable
5724   if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
5725     return SDValue();
5726   }
5727 
5728   EVT DemotedVT;
5729   if (MulType == MVT::i32) {
5730     DemotedVT = MVT::i16;
5731   } else {
5732     DemotedVT = MVT::i32;
5733   }
5734 
5735   // Truncate the operands to the correct size. Note that these are just for
5736   // type consistency and will (likely) be eliminated in later phases.
5737   SDValue TruncLHS =
5738     DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
5739   SDValue TruncRHS =
5740     DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
5741 
5742   unsigned Opc;
5743   if (Signed) {
5744     Opc = NVPTXISD::MUL_WIDE_SIGNED;
5745   } else {
5746     Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
5747   }
5748 
5749   return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
5750 }
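
// Illustrative example (a sketch): for
//   %m = mul i32 (sext i16 %a to i32), (sext i16 %b to i32)
// both operands are demotable to 16 bits with Signed == true, so the combine
// emits NVPTXISD::MUL_WIDE_SIGNED on truncated operands, selecting to
// something like "mul.wide.s16". A shl-by-constant is handled the same way by
// first treating "shl x, c" as a multiply by 1 << c.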
5751 
5752 static bool isConstOne(const SDValue &Operand) {
5753   const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5754   return Const && Const->getZExtValue() == 1;
5755 }
5756 
5757 static SDValue matchMADConstOnePattern(SDValue Add) {
5758   if (Add->getOpcode() != ISD::ADD)
5759     return SDValue();
5760 
5761   if (isConstOne(Add->getOperand(0)))
5762     return Add->getOperand(1);
5763 
5764   if (isConstOne(Add->getOperand(1)))
5765     return Add->getOperand(0);
5766 
5767   return SDValue();
5768 }
5769 
5770 static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL,
5771                                   TargetLowering::DAGCombinerInfo &DCI) {
5772 
5773   if (SDValue Y = matchMADConstOnePattern(Add))
5774     return DCI.DAG.getNode(NVPTXISD::IMAD, DL, VT, X, Y, X);
5775 
5776   return SDValue();
5777 }
5778 
5779 static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT,
5780                                         SDLoc DL,
5781                                         TargetLowering::DAGCombinerInfo &DCI) {
5782   if (Select->getOpcode() != ISD::SELECT)
5783     return SDValue();
5784 
5785   SDValue Cond = Select->getOperand(0);
5786 
5787   unsigned ConstOpNo;
5788   if (isConstOne(Select->getOperand(1)))
5789     ConstOpNo = 1;
5790   else if (isConstOne(Select->getOperand(2)))
5791     ConstOpNo = 2;
5792   else
5793     return SDValue();
5794 
5795   SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);
5796 
5797   // Do not combine if the resulting sequence is not obviously profitable.
5798   if (!matchMADConstOnePattern(Y))
5799     return SDValue();
5800 
5801   SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
5802 
5803   return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
5804                          (ConstOpNo == 1) ? X : NewMul,
5805                          (ConstOpNo == 1) ? NewMul : X);
5806 }
5807 
5808 static SDValue
5809 PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5810                               TargetLowering::DAGCombinerInfo &DCI) {
5811 
5812   EVT VT = N0.getValueType();
5813   if (VT.isVector())
5814     return SDValue();
5815 
5816   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
5817     return SDValue();
5818 
5819   SDLoc DL(N);
5820 
5821   // (mul x, (add y, 1)) -> (mad x, y, x)
5822   if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
5823     return Res;
5824   if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
5825     return Res;
5826 
5827   // (mul x, (select cond, 1, y)) -> (select cond, x, (mul x, y))
5828   if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
5829     return Res;
5830   if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
5831     return Res;
5832 
5833   return SDValue();
5834 }
5835 
5836 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
5837 static SDValue PerformMULCombine(SDNode *N,
5838                                  TargetLowering::DAGCombinerInfo &DCI,
5839                                  CodeGenOptLevel OptLevel) {
5840   if (OptLevel == CodeGenOptLevel::None)
5841     return SDValue();
5842 
5843   if (SDValue Ret = TryMULWIDECombine(N, DCI))
5844     return Ret;
5845 
5846   SDValue N0 = N->getOperand(0);
5847   SDValue N1 = N->getOperand(1);
5848   return PerformMULCombineWithOperands(N, N0, N1, DCI);
5849 }
5850 
5851 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
5852 static SDValue PerformSHLCombine(SDNode *N,
5853                                  TargetLowering::DAGCombinerInfo &DCI,
5854                                  CodeGenOptLevel OptLevel) {
5855   if (OptLevel > CodeGenOptLevel::None) {
5856     // Try mul.wide combining at OptLevel > 0
5857     if (SDValue Ret = TryMULWIDECombine(N, DCI))
5858       return Ret;
5859   }
5860 
5861   return SDValue();
5862 }
5863 
5864 static SDValue PerformSETCCCombine(SDNode *N,
5865                                    TargetLowering::DAGCombinerInfo &DCI,
5866                                    unsigned int SmVersion) {
5867   EVT CCType = N->getValueType(0);
5868   SDValue A = N->getOperand(0);
5869   SDValue B = N->getOperand(1);
5870 
5871   EVT AType = A.getValueType();
5872   if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
5873     return SDValue();
5874 
5875   if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
5876     return SDValue();
5877 
5878   SDLoc DL(N);
5879   // setp.f16x2 returns two scalar predicates, which we need to
5880   // convert back to v2i1. The returned result will be scalarized by
5881   // the legalizer, but the comparison will remain a single vector
5882   // instruction.
5883   SDValue CCNode = DCI.DAG.getNode(
5884       A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
5885                                      : NVPTXISD::SETP_BF16X2,
5886       DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
5887   return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
5888                          CCNode.getValue(1));
5889 }
5890 
5891 static SDValue PerformEXTRACTCombine(SDNode *N,
5892                                      TargetLowering::DAGCombinerInfo &DCI) {
5893   SDValue Vector = N->getOperand(0);
5894   SDLoc DL(N);
5895   EVT VectorVT = Vector.getValueType();
5896   if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
5897       IsPTXVectorType(VectorVT.getSimpleVT()))
5898     return SDValue(); // Native vector loads already combine nicely w/
5899                       // extract_vector_elt.
5900   // Don't mess with singletons or v2*16, v4i8 and v8i8 types; we already
5901   // handle them OK.
5902   if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
5903       VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)
5904     return SDValue();
5905 
5906   // Don't mess with undef values as sra may be simplified to 0, not undef.
5907   if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
5908     return SDValue();
5909 
5910   uint64_t VectorBits = VectorVT.getSizeInBits();
5911   // We only handle the types we can extract in-register.
5912   if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
5913     return SDValue();
5914 
5915   ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
5916   // Index == 0 is handled by generic DAG combiner.
5917   if (!Index || Index->getZExtValue() == 0)
5918     return SDValue();
5919 
5920   MVT IVT = MVT::getIntegerVT(VectorBits);
5921   EVT EltVT = VectorVT.getVectorElementType();
5922   EVT EltIVT = EltVT.changeTypeToInteger();
5923   uint64_t EltBits = EltVT.getScalarSizeInBits();
5924 
5925   SDValue Result = DCI.DAG.getNode(
5926       ISD::TRUNCATE, DL, EltIVT,
5927       DCI.DAG.getNode(
5928           ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
5929           DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
5930 
5931   // If element has non-integer type, bitcast it back to the expected type.
5932   if (EltVT != EltIVT)
5933     Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
5934   // Past the legalizer, we may need to extend i8 -> i16 to match the register type.
5935   if (EltVT != N->getValueType(0))
5936     Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
5937 
5938   return Result;
5939 }
5940 
5941 static SDValue PerformVSELECTCombine(SDNode *N,
5942                                      TargetLowering::DAGCombinerInfo &DCI) {
5943   SDValue VA = N->getOperand(1);
5944   EVT VectorVT = VA.getValueType();
5945   if (VectorVT != MVT::v4i8)
5946     return SDValue();
5947 
5948   // We need to split the vselect into individual per-element operations.
5949   // Because we use BFE/BFI instructions for byte extraction/insertion, we
5950   // end up with 32-bit values anyway, so we may as well do the comparison
5951   // as i32 to avoid the conversions to/from i16 normally used for i8 values.
5952   SmallVector<SDValue, 4> E;
5953   SDLoc DL(N);
5954   SDValue VCond = N->getOperand(0);
5955   SDValue VB = N->getOperand(2);
5956   for (int I = 0; I < 4; ++I) {
5957     SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
5958                                 DCI.DAG.getConstant(I, DL, MVT::i32));
5959     SDValue EA = DCI.DAG.getAnyExtOrTrunc(
5960         DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
5961                         DCI.DAG.getConstant(I, DL, MVT::i32)),
5962         DL, MVT::i32);
5963     SDValue EB = DCI.DAG.getAnyExtOrTrunc(
5964         DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
5965                         DCI.DAG.getConstant(I, DL, MVT::i32)),
5966         DL, MVT::i32);
5967     E.push_back(DCI.DAG.getAnyExtOrTrunc(
5968         DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
5969   }
5970   return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
5971 }
5972 
5973 static SDValue PerformLOADCombine(SDNode *N,
5974                                   TargetLowering::DAGCombinerInfo &DCI) {
5975   SelectionDAG &DAG = DCI.DAG;
5976   LoadSDNode *LD = cast<LoadSDNode>(N);
5977 
5978   // Lower a v16i8 load into a LoadV4 operation with i32 results instead of
5979   // letting ReplaceLoadVector split it into smaller loads during legalization.
5980   // This is done at dag-combine1 time, so that vector operations with i8
5981   // elements can be optimised away instead of being needlessly split during
5982   // legalization, which involves storing to the stack and loading it back.
5983   EVT VT = N->getValueType(0);
5984   if (VT != MVT::v16i8)
5985     return SDValue();
5986 
5987   SDLoc DL(N);
5988 
5989   // Create a v4i32 vector load operation, effectively <4 x v4i8>.
5990   unsigned Opc = NVPTXISD::LoadV4;
5991   EVT NewVT = MVT::v4i32;
5992   EVT EltVT = NewVT.getVectorElementType();
5993   unsigned NumElts = NewVT.getVectorNumElements();
5994   EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
5995   SDVTList RetVTList = DAG.getVTList(RetVTs);
5996   SmallVector<SDValue, 8> Ops(N->ops());
5997   Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
5998   SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT,
5999                                             LD->getMemOperand());
6000   SDValue NewChain = NewLoad.getValue(NumElts);
6001 
6002   // Create a vector of the same type returned by the original load.
6003   SmallVector<SDValue, 4> Elts;
6004   for (unsigned i = 0; i < NumElts; i++)
6005     Elts.push_back(NewLoad.getValue(i));
6006   return DCI.DAG.getMergeValues(
6007       {DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
6008        NewChain},
6009       DL);
6010 }
6011 
6012 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
6013                                                DAGCombinerInfo &DCI) const {
6014   CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
6015   switch (N->getOpcode()) {
6016     default: break;
6017     case ISD::ADD:
6018       return PerformADDCombine(N, DCI, OptLevel);
6019     case ISD::FADD:
6020       return PerformFADDCombine(N, DCI, OptLevel);
6021     case ISD::MUL:
6022       return PerformMULCombine(N, DCI, OptLevel);
6023     case ISD::SHL:
6024       return PerformSHLCombine(N, DCI, OptLevel);
6025     case ISD::AND:
6026       return PerformANDCombine(N, DCI);
6027     case ISD::UREM:
6028     case ISD::SREM:
6029       return PerformREMCombine(N, DCI, OptLevel);
6030     case ISD::SETCC:
6031       return PerformSETCCCombine(N, DCI, STI.getSmVersion());
6032     case ISD::LOAD:
6033       return PerformLOADCombine(N, DCI);
6034     case NVPTXISD::StoreRetval:
6035     case NVPTXISD::StoreRetvalV2:
6036     case NVPTXISD::StoreRetvalV4:
6037       return PerformStoreRetvalCombine(N);
6038     case NVPTXISD::StoreParam:
6039     case NVPTXISD::StoreParamV2:
6040     case NVPTXISD::StoreParamV4:
6041       return PerformStoreParamCombine(N);
6042     case ISD::EXTRACT_VECTOR_ELT:
6043       return PerformEXTRACTCombine(N, DCI);
6044     case ISD::VSELECT:
6045       return PerformVSELECTCombine(N, DCI);
6046   }
6047   return SDValue();
6048 }
6049 
6050 /// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
6051 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
6052                               SmallVectorImpl<SDValue> &Results) {
6053   EVT ResVT = N->getValueType(0);
6054   SDLoc DL(N);
6055 
6056   assert(ResVT.isVector() && "Vector load must have vector type");
6057 
6058   // We only handle "native" vector sizes for now, e.g. <4 x double> is not
6059   // legal.  We can (and should) split that into 2 loads of <2 x double> here
6060   // but I'm leaving that as a TODO for now.
6061   assert(ResVT.isSimple() && "Can only handle simple types");
6062   switch (ResVT.getSimpleVT().SimpleTy) {
6063   default:
6064     return;
6065   case MVT::v2i8:
6066   case MVT::v2i16:
6067   case MVT::v2i32:
6068   case MVT::v2i64:
6069   case MVT::v2f16:
6070   case MVT::v2f32:
6071   case MVT::v2f64:
6072   case MVT::v4i8:
6073   case MVT::v4i16:
6074   case MVT::v4i32:
6075   case MVT::v4f16:
6076   case MVT::v4f32:
6077   case MVT::v8f16:  // <4 x f16x2>
6078   case MVT::v8bf16: // <4 x bf16x2>
6079   case MVT::v8i16:  // <4 x i16x2>
6080     // This is a "native" vector type
6081     break;
6082   }
6083 
6084   LoadSDNode *LD = cast<LoadSDNode>(N);
6085 
6086   Align Alignment = LD->getAlign();
6087   auto &TD = DAG.getDataLayout();
6088   Align PrefAlign =
6089       TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
6090   if (Alignment < PrefAlign) {
6091     // This load is not sufficiently aligned, so bail out and let this vector
6092     // load be scalarized.  Note that we may still be able to emit smaller
6093     // vector loads.  For example, if we are loading a <4 x float> with an
6094     // alignment of 8, this check will fail but the legalizer will try again
6095     // with 2 x <2 x float>, which will succeed with an alignment of 8.
6096     return;
6097   }
6098 
6099   EVT EltVT = ResVT.getVectorElementType();
6100   unsigned NumElts = ResVT.getVectorNumElements();
6101 
6102   // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
6103   // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
6104   // loaded type to i16 and propagate the "real" type as the memory type.
6105   bool NeedTrunc = false;
6106   if (EltVT.getSizeInBits() < 16) {
6107     EltVT = MVT::i16;
6108     NeedTrunc = true;
6109   }
6110 
6111   unsigned Opcode = 0;
6112   SDVTList LdResVTs;
6113   bool Load16x2 = false;
6114 
6115   switch (NumElts) {
6116   default:
6117     return;
6118   case 2:
6119     Opcode = NVPTXISD::LoadV2;
6120     LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
6121     break;
6122   case 4: {
6123     Opcode = NVPTXISD::LoadV4;
6124     EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
6125     LdResVTs = DAG.getVTList(ListVTs);
6126     break;
6127   }
6128   case 8: {
6129     // v8f16 is a special case: PTX doesn't have an ld.v8.f16 instruction.
6130     // Instead, we split the vector into v2f16 chunks and load them with
6131     // ld.v4.b32. The same applies to v8bf16 and v8i16.
6132     assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
6133     Load16x2 = true;
6134     Opcode = NVPTXISD::LoadV4;
6135     EVT VVT;
6136     switch (EltVT.getSimpleVT().SimpleTy) {
6137     case MVT::f16:
6138       VVT = MVT::v2f16;
6139       break;
6140     case MVT::bf16:
6141       VVT = MVT::v2bf16;
6142       break;
6143     case MVT::i16:
6144       VVT = MVT::v2i16;
6145       break;
6146     default:
6147       llvm_unreachable("Unsupported v8 vector type.");
6148     }
6149     EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
6150     LdResVTs = DAG.getVTList(ListVTs);
6151     break;
6152   }
6153   }
6154 
6155   // Copy regular operands
6156   SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
6157 
6158   // The select routine does not have access to the LoadSDNode instance, so
6159   // pass along the extension information
6160   OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
6161 
6162   SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
6163                                           LD->getMemoryVT(),
6164                                           LD->getMemOperand());
6165 
6166   SmallVector<SDValue, 8> ScalarRes;
6167   if (Load16x2) {
6168     // Split v2f16 subvectors back into individual elements.
6169     NumElts /= 2;
6170     for (unsigned i = 0; i < NumElts; ++i) {
6171       SDValue SubVector = NewLD.getValue(i);
6172       SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
6173                                DAG.getIntPtrConstant(0, DL));
6174       SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
6175                                DAG.getIntPtrConstant(1, DL));
6176       ScalarRes.push_back(E0);
6177       ScalarRes.push_back(E1);
6178     }
6179   } else {
6180     for (unsigned i = 0; i < NumElts; ++i) {
6181       SDValue Res = NewLD.getValue(i);
6182       if (NeedTrunc)
6183         Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
6184       ScalarRes.push_back(Res);
6185     }
6186   }
6187 
6188   SDValue LoadChain = NewLD.getValue(NumElts);
6189 
6190   SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
6191 
6192   Results.push_back(BuildVec);
6193   Results.push_back(LoadChain);
6194 }
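
// Illustrative example (hedged): a "load <8 x half>" takes the NumElts == 8
// path above: it becomes one NVPTXISD::LoadV4 returning four v2f16 values,
// which are then split back into eight scalar f16 results to rebuild the
// original <8 x half> vector alongside the load chain.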
6195 
6196 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
6197                                      SmallVectorImpl<SDValue> &Results) {
6198   SDValue Chain = N->getOperand(0);
6199   SDValue Intrin = N->getOperand(1);
6200   SDLoc DL(N);
6201 
6202   // Get the intrinsic ID
6203   unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
6204   switch (IntrinNo) {
6205   default:
6206     return;
6207   case Intrinsic::nvvm_ldg_global_i:
6208   case Intrinsic::nvvm_ldg_global_f:
6209   case Intrinsic::nvvm_ldg_global_p:
6210   case Intrinsic::nvvm_ldu_global_i:
6211   case Intrinsic::nvvm_ldu_global_f:
6212   case Intrinsic::nvvm_ldu_global_p: {
6213     EVT ResVT = N->getValueType(0);
6214 
6215     if (ResVT.isVector()) {
6216       // Vector LDG/LDU
6217 
6218       unsigned NumElts = ResVT.getVectorNumElements();
6219       EVT EltVT = ResVT.getVectorElementType();
6220 
6221       // Since LDU/LDG are target nodes, we cannot rely on DAG type
6222       // legalization.
6223       // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
6224       // loaded type to i16 and propagate the "real" type as the memory type.
6225       bool NeedTrunc = false;
6226       if (EltVT.getSizeInBits() < 16) {
6227         EltVT = MVT::i16;
6228         NeedTrunc = true;
6229       }
6230 
6231       unsigned Opcode = 0;
6232       SDVTList LdResVTs;
6233 
6234       switch (NumElts) {
6235       default:
6236         return;
6237       case 2:
6238         switch (IntrinNo) {
6239         default:
6240           return;
6241         case Intrinsic::nvvm_ldg_global_i:
6242         case Intrinsic::nvvm_ldg_global_f:
6243         case Intrinsic::nvvm_ldg_global_p:
6244           Opcode = NVPTXISD::LDGV2;
6245           break;
6246         case Intrinsic::nvvm_ldu_global_i:
6247         case Intrinsic::nvvm_ldu_global_f:
6248         case Intrinsic::nvvm_ldu_global_p:
6249           Opcode = NVPTXISD::LDUV2;
6250           break;
6251         }
6252         LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
6253         break;
6254       case 4: {
6255         switch (IntrinNo) {
6256         default:
6257           return;
6258         case Intrinsic::nvvm_ldg_global_i:
6259         case Intrinsic::nvvm_ldg_global_f:
6260         case Intrinsic::nvvm_ldg_global_p:
6261           Opcode = NVPTXISD::LDGV4;
6262           break;
6263         case Intrinsic::nvvm_ldu_global_i:
6264         case Intrinsic::nvvm_ldu_global_f:
6265         case Intrinsic::nvvm_ldu_global_p:
6266           Opcode = NVPTXISD::LDUV4;
6267           break;
6268         }
6269         EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
6270         LdResVTs = DAG.getVTList(ListVTs);
6271         break;
6272       }
6273       }
6274 
6275       SmallVector<SDValue, 8> OtherOps;
6276 
6277       // Copy regular operands
6278 
6279       OtherOps.push_back(Chain); // Chain
6280                                  // Skip operand 1 (intrinsic ID)
6281       // Others
6282       OtherOps.append(N->op_begin() + 2, N->op_end());
6283 
6284       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6285 
6286       SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
6287                                               MemSD->getMemoryVT(),
6288                                               MemSD->getMemOperand());
6289 
6290       SmallVector<SDValue, 4> ScalarRes;
6291 
6292       for (unsigned i = 0; i < NumElts; ++i) {
6293         SDValue Res = NewLD.getValue(i);
6294         if (NeedTrunc)
6295           Res =
6296               DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
6297         ScalarRes.push_back(Res);
6298       }
6299 
6300       SDValue LoadChain = NewLD.getValue(NumElts);
6301 
6302       SDValue BuildVec =
6303           DAG.getBuildVector(ResVT, DL, ScalarRes);
6304 
6305       Results.push_back(BuildVec);
6306       Results.push_back(LoadChain);
6307     } else {
6308       // i8 LDG/LDU
6309       assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
6310              "Custom handling of non-i8 ldu/ldg?");
6311 
6312       // Just copy all operands as-is
6313       SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
6314 
6315       // Force output to i16
6316       SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
6317 
6318       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6319 
6320       // We make sure the memory type is i8, which will be used during isel
6321       // to select the proper instruction.
6322       SDValue NewLD =
6323           DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
6324                                   MVT::i8, MemSD->getMemOperand());
6325 
6326       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6327                                     NewLD.getValue(0)));
6328       Results.push_back(NewLD.getValue(1));
6329     }
6330   }
6331   }
6332 }
6333 
6334 static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
6335                                    SmallVectorImpl<SDValue> &Results) {
6336   // Change the CopyFromReg to output two 64-bit results instead of one
6337   // 128-bit result so that it can pass legalization.
6338   SDLoc DL(N);
6339   SDValue Chain = N->getOperand(0);
6340   SDValue Reg = N->getOperand(1);
6341   SDValue Glue = N->getOperand(2);
6342 
6343   assert(Reg.getValueType() == MVT::i128 &&
6344          "Custom lowering for CopyFromReg with 128-bit reg only");
6345   SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
6346                                      N->getValueType(2)};
6347   SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};
6348 
6349   SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
6350   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
6351                              {NewValue.getValue(0), NewValue.getValue(1)});
6352 
6353   Results.push_back(Pair);
6354   Results.push_back(NewValue.getValue(2));
6355   Results.push_back(NewValue.getValue(3));
6356 }
6357 
6358 void NVPTXTargetLowering::ReplaceNodeResults(
6359     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
6360   switch (N->getOpcode()) {
6361   default:
6362     report_fatal_error("Unhandled custom legalization");
6363   case ISD::LOAD:
6364     ReplaceLoadVector(N, DAG, Results);
6365     return;
6366   case ISD::INTRINSIC_W_CHAIN:
6367     ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
6368     return;
6369   case ISD::CopyFromReg:
6370     ReplaceCopyFromReg_128(N, DAG, Results);
6371     return;
6372   }
6373 }
6374 
6375 NVPTXTargetLowering::AtomicExpansionKind
6376 NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
6377   Type *Ty = AI->getValOperand()->getType();
6378 
6379   if (AI->isFloatingPointOperation()) {
6380     if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
6381       if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
6382           STI.getPTXVersion() >= 63)
6383         return AtomicExpansionKind::None;
6384       if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
6385           STI.getPTXVersion() >= 78)
6386         return AtomicExpansionKind::None;
6387       if (Ty->isFloatTy())
6388         return AtomicExpansionKind::None;
6389       if (Ty->isDoubleTy() && STI.hasAtomAddF64())
6390         return AtomicExpansionKind::None;
6391     }
6392     return AtomicExpansionKind::CmpXChg;
6393   }
6394 
6395   assert(Ty->isIntegerTy() && "Ty should be integer at this point");
6396   auto ITy = cast<llvm::IntegerType>(Ty);
6397 
6398   switch (AI->getOperation()) {
6399   default:
6400     return AtomicExpansionKind::CmpXChg;
6401   case AtomicRMWInst::BinOp::And:
6402   case AtomicRMWInst::BinOp::Or:
6403   case AtomicRMWInst::BinOp::Xor:
6404   case AtomicRMWInst::BinOp::Xchg:
6405     switch (ITy->getBitWidth()) {
6406     case 8:
6407     case 16:
6408       return AtomicExpansionKind::CmpXChg;
6409     case 32:
6410       return AtomicExpansionKind::None;
6411     case 64:
6412       if (STI.hasAtomBitwise64())
6413         return AtomicExpansionKind::None;
6414       return AtomicExpansionKind::CmpXChg;
6415     default:
6416       llvm_unreachable("unsupported width encountered");
6417     }
6418   case AtomicRMWInst::BinOp::Add:
6419   case AtomicRMWInst::BinOp::Sub:
6420   case AtomicRMWInst::BinOp::Max:
6421   case AtomicRMWInst::BinOp::Min:
6422   case AtomicRMWInst::BinOp::UMax:
6423   case AtomicRMWInst::BinOp::UMin:
6424     switch (ITy->getBitWidth()) {
6425     case 8:
6426     case 16:
6427       return AtomicExpansionKind::CmpXChg;
6428     case 32:
6429       return AtomicExpansionKind::None;
6430     case 64:
6431       if (STI.hasAtomMinMax64())
6432         return AtomicExpansionKind::None;
6433       return AtomicExpansionKind::CmpXChg;
6434     default:
6435       llvm_unreachable("unsupported width encountered");
6436     }
6437   }
6438 
6439   return AtomicExpansionKind::CmpXChg;
6440 }
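
// Illustrative summary (hedged): for example, "atomicrmw add i32" returns
// AtomicExpansionKind::None and maps to a native atomic (e.g. atom.add.u32),
// while "atomicrmw add i16" returns CmpXChg and is expanded by
// AtomicExpandPass into a compare-exchange loop; 64-bit min/max and bitwise
// ops are native only when the subtarget reports hasAtomMinMax64() or
// hasAtomBitwise64().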
6441 
6442 // Pin NVPTXTargetObjectFile's vtables to this file.
6443 NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;
6444 
6445 MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
6446     const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
6447   return getDataSection();
6448 }
6449