1 //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the interfaces that NVPTX uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "NVPTXISelLowering.h"
15 #include "MCTargetDesc/NVPTXBaseInfo.h"
16 #include "NVPTX.h"
17 #include "NVPTXSubtarget.h"
18 #include "NVPTXTargetMachine.h"
19 #include "NVPTXTargetObjectFile.h"
20 #include "NVPTXUtilities.h"
21 #include "llvm/ADT/APInt.h"
22 #include "llvm/ADT/STLExtras.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/ADT/StringRef.h"
25 #include "llvm/CodeGen/Analysis.h"
26 #include "llvm/CodeGen/ISDOpcodes.h"
27 #include "llvm/CodeGen/MachineFunction.h"
28 #include "llvm/CodeGen/MachineMemOperand.h"
29 #include "llvm/CodeGen/MachineValueType.h"
30 #include "llvm/CodeGen/SelectionDAG.h"
31 #include "llvm/CodeGen/SelectionDAGNodes.h"
32 #include "llvm/CodeGen/TargetCallingConv.h"
33 #include "llvm/CodeGen/TargetLowering.h"
34 #include "llvm/CodeGen/ValueTypes.h"
35 #include "llvm/IR/Argument.h"
36 #include "llvm/IR/Attributes.h"
37 #include "llvm/IR/Constants.h"
38 #include "llvm/IR/DataLayout.h"
39 #include "llvm/IR/DerivedTypes.h"
40 #include "llvm/IR/DiagnosticInfo.h"
41 #include "llvm/IR/FPEnv.h"
42 #include "llvm/IR/Function.h"
43 #include "llvm/IR/GlobalValue.h"
44 #include "llvm/IR/Instruction.h"
45 #include "llvm/IR/Instructions.h"
46 #include "llvm/IR/IntrinsicsNVPTX.h"
47 #include "llvm/IR/Module.h"
48 #include "llvm/IR/Type.h"
49 #include "llvm/IR/Value.h"
50 #include "llvm/Support/Casting.h"
51 #include "llvm/Support/CodeGen.h"
52 #include "llvm/Support/CommandLine.h"
53 #include "llvm/Support/ErrorHandling.h"
54 #include "llvm/Support/raw_ostream.h"
55 #include "llvm/Target/TargetMachine.h"
56 #include "llvm/Target/TargetOptions.h"
57 #include <algorithm>
58 #include <cassert>
59 #include <cmath>
60 #include <cstdint>
61 #include <iterator>
62 #include <sstream>
63 #include <string>
64 #include <utility>
65 #include <vector>
66
67 #define DEBUG_TYPE "nvptx-lower"
68
69 using namespace llvm;
70
71 static std::atomic<unsigned> GlobalUniqueCallSite;
72
73 static cl::opt<bool> sched4reg(
74     "nvptx-sched4reg",
75     cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));
76
77 static cl::opt<unsigned> FMAContractLevelOpt(
78     "nvptx-fma-level", cl::Hidden,
79     cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
80              " 1: do it, 2: do it aggressively)"),
81     cl::init(2));
82
83 static cl::opt<int> UsePrecDivF32(
84     "nvptx-prec-divf32", cl::Hidden,
85     cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
86              " IEEE Compliant F32 div.rnd if available."),
87     cl::init(2));
88
89 static cl::opt<bool> UsePrecSqrtF32(
90     "nvptx-prec-sqrtf32", cl::Hidden,
91     cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
92     cl::init(true));
93
94 static cl::opt<bool> ForceMinByValParamAlign(
95     "nvptx-force-min-byval-param-align", cl::Hidden,
96     cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
97              " params of device functions."),
98     cl::init(false));
99
100 int NVPTXTargetLowering::getDivF32Level() const {
101   if
(UsePrecDivF32.getNumOccurrences() > 0) { 102 // If nvptx-prec-div32=N is used on the command-line, always honor it 103 return UsePrecDivF32; 104 } else { 105 // Otherwise, use div.approx if fast math is enabled 106 if (getTargetMachine().Options.UnsafeFPMath) 107 return 0; 108 else 109 return 2; 110 } 111 } 112 113 bool NVPTXTargetLowering::usePrecSqrtF32() const { 114 if (UsePrecSqrtF32.getNumOccurrences() > 0) { 115 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it 116 return UsePrecSqrtF32; 117 } else { 118 // Otherwise, use sqrt.approx if fast math is enabled 119 return !getTargetMachine().Options.UnsafeFPMath; 120 } 121 } 122 123 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const { 124 return MF.getDenormalMode(APFloat::IEEEsingle()).Output == 125 DenormalMode::PreserveSign; 126 } 127 128 static bool IsPTXVectorType(MVT VT) { 129 switch (VT.SimpleTy) { 130 default: 131 return false; 132 case MVT::v2i1: 133 case MVT::v4i1: 134 case MVT::v2i8: 135 case MVT::v4i8: 136 case MVT::v2i16: 137 case MVT::v4i16: 138 case MVT::v8i16: // <4 x i16x2> 139 case MVT::v2i32: 140 case MVT::v4i32: 141 case MVT::v2i64: 142 case MVT::v2f16: 143 case MVT::v4f16: 144 case MVT::v8f16: // <4 x f16x2> 145 case MVT::v2bf16: 146 case MVT::v4bf16: 147 case MVT::v8bf16: // <4 x bf16x2> 148 case MVT::v2f32: 149 case MVT::v4f32: 150 case MVT::v2f64: 151 return true; 152 } 153 } 154 155 static bool Is16bitsType(MVT VT) { 156 return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 || 157 VT.SimpleTy == MVT::i16); 158 } 159 160 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive 161 /// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors 162 /// into their primitive components. 163 /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the 164 /// same number of types as the Ins/Outs arrays in LowerFormalArguments, 165 /// LowerCall, and LowerReturn. 166 static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, 167 Type *Ty, SmallVectorImpl<EVT> &ValueVTs, 168 SmallVectorImpl<uint64_t> *Offsets = nullptr, 169 uint64_t StartingOffset = 0) { 170 SmallVector<EVT, 16> TempVTs; 171 SmallVector<uint64_t, 16> TempOffsets; 172 173 // Special case for i128 - decompose to (i64, i64) 174 if (Ty->isIntegerTy(128)) { 175 ValueVTs.push_back(EVT(MVT::i64)); 176 ValueVTs.push_back(EVT(MVT::i64)); 177 178 if (Offsets) { 179 Offsets->push_back(StartingOffset + 0); 180 Offsets->push_back(StartingOffset + 8); 181 } 182 183 return; 184 } 185 186 // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs. 187 if (StructType *STy = dyn_cast<StructType>(Ty)) { 188 auto const *SL = DL.getStructLayout(STy); 189 auto ElementNum = 0; 190 for(auto *EI : STy->elements()) { 191 ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets, 192 StartingOffset + SL->getElementOffset(ElementNum)); 193 ++ElementNum; 194 } 195 return; 196 } 197 198 ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset); 199 for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) { 200 EVT VT = TempVTs[i]; 201 uint64_t Off = TempOffsets[i]; 202 // Split vectors into individual elements, except for v2f16, which 203 // we will pass as a single scalar. 204 if (VT.isVector()) { 205 unsigned NumElts = VT.getVectorNumElements(); 206 EVT EltVT = VT.getVectorElementType(); 207 // Vectors with an even number of f16 elements will be passed to 208 // us as an array of v2f16/v2bf16 elements. 
We must match this so we 209 // stay in sync with Ins/Outs. 210 if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) { 211 switch (EltVT.getSimpleVT().SimpleTy) { 212 case MVT::f16: 213 EltVT = MVT::v2f16; 214 break; 215 case MVT::bf16: 216 EltVT = MVT::v2bf16; 217 break; 218 case MVT::i16: 219 EltVT = MVT::v2i16; 220 break; 221 default: 222 llvm_unreachable("Unexpected type"); 223 } 224 NumElts /= 2; 225 } else if (EltVT.getSimpleVT() == MVT::i8 && 226 (NumElts % 4 == 0 || NumElts == 3)) { 227 // v*i8 are formally lowered as v4i8 228 EltVT = MVT::v4i8; 229 NumElts = (NumElts + 3) / 4; 230 } 231 for (unsigned j = 0; j != NumElts; ++j) { 232 ValueVTs.push_back(EltVT); 233 if (Offsets) 234 Offsets->push_back(Off + j * EltVT.getStoreSize()); 235 } 236 } else { 237 ValueVTs.push_back(VT); 238 if (Offsets) 239 Offsets->push_back(Off); 240 } 241 } 242 } 243 244 /// PromoteScalarIntegerPTX 245 /// Used to make sure the arguments/returns are suitable for passing 246 /// and promote them to a larger size if they're not. 247 /// 248 /// The promoted type is placed in \p PromoteVT if the function returns true. 249 static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) { 250 if (VT.isScalarInteger()) { 251 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) { 252 default: 253 llvm_unreachable( 254 "Promotion is not suitable for scalars of size larger than 64-bits"); 255 case 1: 256 *PromotedVT = MVT::i1; 257 break; 258 case 2: 259 case 4: 260 case 8: 261 *PromotedVT = MVT::i8; 262 break; 263 case 16: 264 *PromotedVT = MVT::i16; 265 break; 266 case 32: 267 *PromotedVT = MVT::i32; 268 break; 269 case 64: 270 *PromotedVT = MVT::i64; 271 break; 272 } 273 return EVT(*PromotedVT) != VT; 274 } 275 return false; 276 } 277 278 // Check whether we can merge loads/stores of some of the pieces of a 279 // flattened function parameter or return value into a single vector 280 // load/store. 281 // 282 // The flattened parameter is represented as a list of EVTs and 283 // offsets, and the whole structure is aligned to ParamAlignment. This 284 // function determines whether we can load/store pieces of the 285 // parameter starting at index Idx using a single vectorized op of 286 // size AccessSize. If so, it returns the number of param pieces 287 // covered by the vector op. Otherwise, it returns 1. 288 static unsigned CanMergeParamLoadStoresStartingAt( 289 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs, 290 const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) { 291 292 // Can't vectorize if param alignment is not sufficient. 293 if (ParamAlignment < AccessSize) 294 return 1; 295 // Can't vectorize if offset is not aligned. 296 if (Offsets[Idx] & (AccessSize - 1)) 297 return 1; 298 299 EVT EltVT = ValueVTs[Idx]; 300 unsigned EltSize = EltVT.getStoreSize(); 301 302 // Element is too large to vectorize. 303 if (EltSize >= AccessSize) 304 return 1; 305 306 unsigned NumElts = AccessSize / EltSize; 307 // Can't vectorize if AccessBytes if not a multiple of EltSize. 308 if (AccessSize != EltSize * NumElts) 309 return 1; 310 311 // We don't have enough elements to vectorize. 312 if (Idx + NumElts > ValueVTs.size()) 313 return 1; 314 315 // PTX ISA can only deal with 2- and 4-element vector ops. 316 if (NumElts != 4 && NumElts != 2) 317 return 1; 318 319 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) { 320 // Types do not match. 321 if (ValueVTs[j] != EltVT) 322 return 1; 323 324 // Elements are not contiguous. 
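// (i.e. each merged piece must start exactly EltSize bytes after the previous one)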
325     if (Offsets[j] - Offsets[j - 1] != EltSize)
326       return 1;
327   }
328   // OK. We can vectorize ValueVTs[Idx..Idx+NumElts)
329   return NumElts;
330 }
331
332 // Flags for tracking per-element vectorization state of loads/stores
333 // of a flattened function parameter or return value.
334 enum ParamVectorizationFlags {
335   PVF_INNER = 0x0,  // Middle elements of a vector.
336   PVF_FIRST = 0x1,  // First element of the vector.
337   PVF_LAST = 0x2,   // Last element of the vector.
338   // Scalar is effectively a 1-element vector.
339   PVF_SCALAR = PVF_FIRST | PVF_LAST
340 };
341
342 // Computes whether and how we can vectorize the loads/stores of a
343 // flattened function parameter or return value.
344 //
345 // The flattened parameter is represented as the list of ValueVTs and
346 // Offsets, and is aligned to ParamAlignment bytes. We return a vector
347 // of the same size as ValueVTs indicating how each piece should be
348 // loaded/stored (i.e. as a scalar, or as part of a vector
349 // load/store).
350 static SmallVector<ParamVectorizationFlags, 16>
351 VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
352                      const SmallVectorImpl<uint64_t> &Offsets,
353                      Align ParamAlignment, bool IsVAArg = false) {
354   // Set vector size to match ValueVTs and mark all elements as
355   // scalars by default.
356   SmallVector<ParamVectorizationFlags, 16> VectorInfo;
357   VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
358
359   if (IsVAArg)
360     return VectorInfo;
361
362   // Check what we can vectorize using 128/64/32/16-bit accesses.
363   for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
364     // Skip elements we've already processed.
365     assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
366     for (unsigned AccessSize : {16, 8, 4, 2}) {
367       unsigned NumElts = CanMergeParamLoadStoresStartingAt(
368           I, AccessSize, ValueVTs, Offsets, ParamAlignment);
369       // Mark vectorized elements.
370       switch (NumElts) {
371       default:
372         llvm_unreachable("Unexpected return value");
373       case 1:
374         // Can't vectorize using this size; try the next smaller size.
375         continue;
376       case 2:
377         assert(I + 1 < E && "Not enough elements.");
378         VectorInfo[I] = PVF_FIRST;
379         VectorInfo[I + 1] = PVF_LAST;
380         I += 1;
381         break;
382       case 4:
383         assert(I + 3 < E && "Not enough elements.");
384         VectorInfo[I] = PVF_FIRST;
385         VectorInfo[I + 1] = PVF_INNER;
386         VectorInfo[I + 2] = PVF_INNER;
387         VectorInfo[I + 3] = PVF_LAST;
388         I += 3;
389         break;
390       }
391       // Break out of the inner loop because we've already succeeded
392       // using the largest possible AccessSize.
393       break;
394     }
395   }
396   return VectorInfo;
397 }
398
399 // NVPTXTargetLowering Constructor.
400 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
401                                          const NVPTXSubtarget &STI)
402     : TargetLowering(TM), nvTM(&TM), STI(STI) {
403   // Always lower memset, memcpy, and memmove intrinsics to load/store
404   // instructions, rather
405   // than generating calls to memset, memcpy, or memmove.
406   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF;
407   MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned)0xFFFFFFFF;
408   MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned)0xFFFFFFFF;
409
410   setBooleanContents(ZeroOrNegativeOneBooleanContent);
411   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
412
413   // Jump is Expensive. Don't create extra control flow for 'and', 'or'
414   // condition branches.
415   setJumpIsExpensive(true);
416
417   // Wide divides are _very_ slow. Try to reduce the width of the divide if
418   // possible.
419   addBypassSlowDiv(64, 32);
420
421   // By default, use Source scheduling.
422   if (sched4reg)
423     setSchedulingPreference(Sched::RegPressure);
424   else
425     setSchedulingPreference(Sched::Source);
426
427   auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
428                                     LegalizeAction NoF16Action) {
429     setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
430   };
431
432   auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
433                                     LegalizeAction NoBF16Action) {
434     bool IsOpSupported = STI.hasBF16Math();
435     // A few of these instructions are only available on sm_90.
436     switch (Op) {
437     case ISD::FADD:
438     case ISD::FMUL:
439     case ISD::FSUB:
440     case ISD::SELECT:
441     case ISD::SELECT_CC:
442     case ISD::SETCC:
443     case ISD::FEXP2:
444     case ISD::FCEIL:
445     case ISD::FFLOOR:
446     case ISD::FNEARBYINT:
447     case ISD::FRINT:
448     case ISD::FTRUNC:
449       IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;
450       break;
451     }
452     setOperationAction(
453         Op, VT, IsOpSupported ? Action : NoBF16Action);
454   };
455
456   auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
457                                      LegalizeAction NoI16x2Action) {
458     bool IsOpSupported = false;
459     // These instructions are only available on sm_90.
460     switch (Op) {
461     case ISD::ADD:
462     case ISD::SMAX:
463     case ISD::SMIN:
464     case ISD::UMIN:
465     case ISD::UMAX:
466     case ISD::SUB:
467       IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
468       break;
469     }
470     setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
471   };
472
473   addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
474   addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
475   addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
476   addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
477   addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
478   addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
479   addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
480   addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
481   addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
482   addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
483   addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
484   addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);
485
486   // Conversion to/from FP16/FP16x2 is always legal.
487   setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
488   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
489   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
490   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);
491
492   setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
493   setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
494
495   // Conversion to/from BF16/BF16x2 is always legal.
496   setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Custom);
497   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2bf16, Custom);
498   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2bf16, Expand);
499   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2bf16, Expand);
500
501   setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
502   setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
503   if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
504     AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
505
506   // Conversion to/from i16/i16x2 is always legal.
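  // Like v2f16/v2bf16, v2i16 and v4i8 live in a single 32-bit register (see
  // Int32RegsRegClass above), so BUILD_VECTOR/EXTRACT_VECTOR_ELT below are
  // custom-lowered to pack/unpack the individual elements of that register.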
507 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom); 508 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); 509 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand); 510 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand); 511 512 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom); 513 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom); 514 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); 515 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom); 516 // Only logical ops can be done on v4i8 directly, others must be done 517 // elementwise. 518 setOperationAction( 519 {ISD::ABS, ISD::ADD, ISD::ADDC, ISD::ADDE, 520 ISD::BITREVERSE, ISD::CTLZ, ISD::CTPOP, ISD::CTTZ, 521 ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FSHL, ISD::FSHR, 522 ISD::MUL, ISD::MULHS, ISD::MULHU, ISD::PARITY, 523 ISD::ROTL, ISD::ROTR, ISD::SADDO, ISD::SADDO_CARRY, 524 ISD::SADDSAT, ISD::SDIV, ISD::SDIVREM, ISD::SELECT_CC, 525 ISD::SETCC, ISD::SHL, ISD::SINT_TO_FP, ISD::SMAX, 526 ISD::SMIN, ISD::SMULO, ISD::SMUL_LOHI, ISD::SRA, 527 ISD::SREM, ISD::SRL, ISD::SSHLSAT, ISD::SSUBO, 528 ISD::SSUBO_CARRY, ISD::SSUBSAT, ISD::SUB, ISD::SUBC, 529 ISD::SUBE, ISD::UADDO, ISD::UADDO_CARRY, ISD::UADDSAT, 530 ISD::UDIV, ISD::UDIVREM, ISD::UINT_TO_FP, ISD::UMAX, 531 ISD::UMIN, ISD::UMULO, ISD::UMUL_LOHI, ISD::UREM, 532 ISD::USHLSAT, ISD::USUBO, ISD::USUBO_CARRY, ISD::VSELECT, 533 ISD::USUBSAT}, 534 MVT::v4i8, Expand); 535 536 // Operations not directly supported by NVPTX. 537 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, 538 MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8, 539 MVT::i32, MVT::i64}) { 540 setOperationAction(ISD::SELECT_CC, VT, Expand); 541 setOperationAction(ISD::BR_CC, VT, Expand); 542 } 543 544 // Some SIGN_EXTEND_INREG can be done using cvt instruction. 545 // For others we will expand to a SHL/SRA pair. 546 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal); 547 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 548 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); 549 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 550 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 551 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); 552 553 setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom); 554 setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom); 555 setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom); 556 setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom); 557 setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom); 558 setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom); 559 560 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 561 setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); 562 563 // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs 564 // that don't have h/w rotation we lower them to multi-instruction assembly. 
565 // See ROT*_sw in NVPTXIntrInfo.td 566 setOperationAction(ISD::ROTL, MVT::i64, Legal); 567 setOperationAction(ISD::ROTR, MVT::i64, Legal); 568 setOperationAction(ISD::ROTL, MVT::i32, Legal); 569 setOperationAction(ISD::ROTR, MVT::i32, Legal); 570 571 setOperationAction(ISD::ROTL, MVT::i16, Expand); 572 setOperationAction(ISD::ROTL, MVT::v2i16, Expand); 573 setOperationAction(ISD::ROTR, MVT::i16, Expand); 574 setOperationAction(ISD::ROTR, MVT::v2i16, Expand); 575 setOperationAction(ISD::ROTL, MVT::i8, Expand); 576 setOperationAction(ISD::ROTR, MVT::i8, Expand); 577 setOperationAction(ISD::BSWAP, MVT::i16, Expand); 578 setOperationAction(ISD::BSWAP, MVT::v2i16, Expand); 579 setOperationAction(ISD::BSWAP, MVT::i32, Expand); 580 setOperationAction(ISD::BSWAP, MVT::i64, Expand); 581 582 // Indirect branch is not supported. 583 // This also disables Jump Table creation. 584 setOperationAction(ISD::BR_JT, MVT::Other, Expand); 585 setOperationAction(ISD::BRIND, MVT::Other, Expand); 586 587 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 588 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 589 590 // We want to legalize constant related memmove and memcopy 591 // intrinsics. 592 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); 593 594 // Turn FP extload into load/fpextend 595 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); 596 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); 597 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand); 598 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand); 599 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); 600 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); 601 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); 602 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand); 603 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand); 604 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); 605 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); 606 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); 607 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand); 608 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand); 609 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); 610 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); 611 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); 612 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand); 613 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand); 614 // Turn FP truncstore into trunc + store. 615 // FIXME: vector types should also be expanded 616 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 617 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 618 setTruncStoreAction(MVT::f32, MVT::bf16, Expand); 619 setTruncStoreAction(MVT::f64, MVT::bf16, Expand); 620 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 621 622 // PTX does not support load / store predicate registers 623 setOperationAction(ISD::LOAD, MVT::i1, Custom); 624 setOperationAction(ISD::STORE, MVT::i1, Custom); 625 626 for (MVT VT : MVT::integer_valuetypes()) { 627 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 628 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); 629 setTruncStoreAction(VT, MVT::i1, Expand); 630 } 631 632 // expand extload of vector of integers. 
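  // There is no single PTX instruction that widens a v2i8 load to v2i16, so
  // the extending-load combination below is legalized as a separate load plus
  // explicit extension (and, symmetrically, a truncate followed by a store).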
633 setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16, 634 MVT::v2i8, Expand); 635 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand); 636 637 // This is legal in NVPTX 638 setOperationAction(ISD::ConstantFP, MVT::f64, Legal); 639 setOperationAction(ISD::ConstantFP, MVT::f32, Legal); 640 setOperationAction(ISD::ConstantFP, MVT::f16, Legal); 641 setOperationAction(ISD::ConstantFP, MVT::bf16, Legal); 642 643 // Lowering of DYNAMIC_STACKALLOC is unsupported. 644 // Custom lower to produce an error. 645 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 646 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); 647 648 // TRAP can be lowered to PTX trap 649 setOperationAction(ISD::TRAP, MVT::Other, Legal); 650 651 // Register custom handling for vector loads/stores 652 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 653 if (IsPTXVectorType(VT)) { 654 setOperationAction(ISD::LOAD, VT, Custom); 655 setOperationAction(ISD::STORE, VT, Custom); 656 setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom); 657 } 658 } 659 660 // Support varargs. 661 setOperationAction(ISD::VASTART, MVT::Other, Custom); 662 setOperationAction(ISD::VAARG, MVT::Other, Custom); 663 setOperationAction(ISD::VACOPY, MVT::Other, Expand); 664 setOperationAction(ISD::VAEND, MVT::Other, Expand); 665 666 // Custom handling for i8 intrinsics 667 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); 668 669 for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) { 670 setOperationAction(ISD::ABS, Ty, Legal); 671 setOperationAction(ISD::SMIN, Ty, Legal); 672 setOperationAction(ISD::SMAX, Ty, Legal); 673 setOperationAction(ISD::UMIN, Ty, Legal); 674 setOperationAction(ISD::UMAX, Ty, Legal); 675 676 setOperationAction(ISD::CTPOP, Ty, Legal); 677 setOperationAction(ISD::CTLZ, Ty, Legal); 678 } 679 680 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom); 681 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom); 682 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom); 683 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom); 684 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom); 685 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand); 686 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand); 687 688 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom); 689 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom); 690 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom); 691 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom); 692 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom); 693 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom); 694 695 // Other arithmetic and logic ops are unsupported. 
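  // Expanding these unrolls the v2i16 operation into two scalar i16 ops,
  // e.g. a v2i16 sdiv becomes two i16 sdivs whose results are re-packed.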
696 setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS, 697 ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT, 698 ISD::SINT_TO_FP, ISD::UINT_TO_FP}, 699 MVT::v2i16, Expand); 700 701 setOperationAction(ISD::ADDC, MVT::i32, Legal); 702 setOperationAction(ISD::ADDE, MVT::i32, Legal); 703 setOperationAction(ISD::SUBC, MVT::i32, Legal); 704 setOperationAction(ISD::SUBE, MVT::i32, Legal); 705 if (STI.getPTXVersion() >= 43) { 706 setOperationAction(ISD::ADDC, MVT::i64, Legal); 707 setOperationAction(ISD::ADDE, MVT::i64, Legal); 708 setOperationAction(ISD::SUBC, MVT::i64, Legal); 709 setOperationAction(ISD::SUBE, MVT::i64, Legal); 710 } 711 712 setOperationAction(ISD::CTTZ, MVT::i16, Expand); 713 setOperationAction(ISD::CTTZ, MVT::v2i16, Expand); 714 setOperationAction(ISD::CTTZ, MVT::i32, Expand); 715 setOperationAction(ISD::CTTZ, MVT::i64, Expand); 716 717 // PTX does not directly support SELP of i1, so promote to i32 first 718 setOperationAction(ISD::SELECT, MVT::i1, Custom); 719 720 // PTX cannot multiply two i64s in a single instruction. 721 setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); 722 setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); 723 724 // We have some custom DAG combine patterns for these nodes 725 setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD, 726 ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, 727 ISD::VSELECT}); 728 729 // setcc for f16x2 and bf16x2 needs special handling to prevent 730 // legalizer's attempt to scalarize it due to v2i1 not being legal. 731 if (STI.allowFP16Math() || STI.hasBF16Math()) 732 setTargetDAGCombine(ISD::SETCC); 733 734 // Promote fp16 arithmetic if fp16 hardware isn't available or the 735 // user passed --nvptx-no-fp16-math. The flag is useful because, 736 // although sm_53+ GPUs have some sort of FP16 support in 737 // hardware, only sm_53 and sm_60 have full implementation. Others 738 // only have token amount of hardware and are likely to run faster 739 // by using fp32 units instead. 740 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) { 741 setFP16OperationAction(Op, MVT::f16, Legal, Promote); 742 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand); 743 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); 744 // bf16 must be promoted to f32. 745 setBF16OperationAction(Op, MVT::bf16, Legal, Promote); 746 if (getOperationAction(Op, MVT::bf16) == Promote) 747 AddPromotedToType(Op, MVT::bf16, MVT::f32); 748 } 749 750 // f16/f16x2 neg was introduced in PTX 60, SM_53. 751 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 && 752 STI.getPTXVersion() >= 60 && 753 STI.allowFP16Math(); 754 for (const auto &VT : {MVT::f16, MVT::v2f16}) 755 setOperationAction(ISD::FNEG, VT, 756 IsFP16FP16x2NegAvailable ? Legal : Expand); 757 758 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand); 759 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand); 760 // (would be) Library functions. 761 762 // These map to conversion instructions for scalar FP types. 
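  // (FCEIL -> cvt.rpi, FFLOOR -> cvt.rmi, FTRUNC -> cvt.rzi, and
  // FRINT/FNEARBYINT/FROUNDEVEN -> cvt.rni.)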
763 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT, 764 ISD::FROUNDEVEN, ISD::FTRUNC}) { 765 setOperationAction(Op, MVT::f16, Legal); 766 setOperationAction(Op, MVT::f32, Legal); 767 setOperationAction(Op, MVT::f64, Legal); 768 setOperationAction(Op, MVT::v2f16, Expand); 769 setOperationAction(Op, MVT::v2bf16, Expand); 770 setBF16OperationAction(Op, MVT::bf16, Legal, Promote); 771 if (getOperationAction(Op, MVT::bf16) == Promote) 772 AddPromotedToType(Op, MVT::bf16, MVT::f32); 773 } 774 775 // sm_80 only has conversions between f32 and bf16. Custom lower all other 776 // bf16 conversions. 777 if (STI.hasBF16Math() && 778 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) { 779 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) { 780 setOperationAction( 781 {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, 782 VT, Custom); 783 } 784 } 785 786 setOperationAction(ISD::FROUND, MVT::f16, Promote); 787 setOperationAction(ISD::FROUND, MVT::v2f16, Expand); 788 setOperationAction(ISD::FROUND, MVT::v2bf16, Expand); 789 setOperationAction(ISD::FROUND, MVT::f32, Custom); 790 setOperationAction(ISD::FROUND, MVT::f64, Custom); 791 setOperationAction(ISD::FROUND, MVT::bf16, Promote); 792 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32); 793 794 // 'Expand' implements FCOPYSIGN without calling an external library. 795 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); 796 setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand); 797 setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand); 798 setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand); 799 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 800 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 801 802 // These map to corresponding instructions for f32/f64. f16 must be 803 // promoted to f32. v2f16 is expanded to f16, which is then promoted 804 // to f32. 805 for (const auto &Op : 806 {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) { 807 setOperationAction(Op, MVT::f16, Promote); 808 setOperationAction(Op, MVT::f32, Legal); 809 setOperationAction(Op, MVT::f64, Legal); 810 setOperationAction(Op, MVT::v2f16, Expand); 811 setOperationAction(Op, MVT::v2bf16, Expand); 812 setOperationAction(Op, MVT::bf16, Promote); 813 AddPromotedToType(Op, MVT::bf16, MVT::f32); 814 } 815 for (const auto &Op : {ISD::FABS}) { 816 setOperationAction(Op, MVT::f16, Promote); 817 setOperationAction(Op, MVT::f32, Legal); 818 setOperationAction(Op, MVT::f64, Legal); 819 setOperationAction(Op, MVT::v2f16, Expand); 820 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); 821 setBF16OperationAction(Op, MVT::bf16, Legal, Promote); 822 if (getOperationAction(Op, MVT::bf16) == Promote) 823 AddPromotedToType(Op, MVT::bf16, MVT::f32); 824 } 825 826 // max.f16, max.f16x2 and max.NaN are supported on sm_80+. 827 auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) { 828 bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70; 829 return IsAtLeastSm80 ? 
Legal : NotSm80Action; 830 }; 831 for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) { 832 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote); 833 setOperationAction(Op, MVT::f32, Legal); 834 setOperationAction(Op, MVT::f64, Legal); 835 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand); 836 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); 837 setBF16OperationAction(Op, MVT::bf16, Legal, Promote); 838 if (getOperationAction(Op, MVT::bf16) == Promote) 839 AddPromotedToType(Op, MVT::bf16, MVT::f32); 840 } 841 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) { 842 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand); 843 setFP16OperationAction(Op, MVT::bf16, Legal, Expand); 844 setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand)); 845 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand); 846 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); 847 } 848 849 // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate. 850 // No FPOW or FREM in PTX. 851 852 // Now deduce the information based on the above mentioned 853 // actions 854 computeRegisterProperties(STI.getRegisterInfo()); 855 856 setMinCmpXchgSizeInBits(32); 857 setMaxAtomicSizeInBitsSupported(64); 858 } 859 860 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { 861 switch ((NVPTXISD::NodeType)Opcode) { 862 case NVPTXISD::FIRST_NUMBER: 863 break; 864 case NVPTXISD::CALL: 865 return "NVPTXISD::CALL"; 866 case NVPTXISD::RET_GLUE: 867 return "NVPTXISD::RET_GLUE"; 868 case NVPTXISD::LOAD_PARAM: 869 return "NVPTXISD::LOAD_PARAM"; 870 case NVPTXISD::Wrapper: 871 return "NVPTXISD::Wrapper"; 872 case NVPTXISD::DeclareParam: 873 return "NVPTXISD::DeclareParam"; 874 case NVPTXISD::DeclareScalarParam: 875 return "NVPTXISD::DeclareScalarParam"; 876 case NVPTXISD::DeclareRet: 877 return "NVPTXISD::DeclareRet"; 878 case NVPTXISD::DeclareScalarRet: 879 return "NVPTXISD::DeclareScalarRet"; 880 case NVPTXISD::DeclareRetParam: 881 return "NVPTXISD::DeclareRetParam"; 882 case NVPTXISD::PrintCall: 883 return "NVPTXISD::PrintCall"; 884 case NVPTXISD::PrintConvergentCall: 885 return "NVPTXISD::PrintConvergentCall"; 886 case NVPTXISD::PrintCallUni: 887 return "NVPTXISD::PrintCallUni"; 888 case NVPTXISD::PrintConvergentCallUni: 889 return "NVPTXISD::PrintConvergentCallUni"; 890 case NVPTXISD::LoadParam: 891 return "NVPTXISD::LoadParam"; 892 case NVPTXISD::LoadParamV2: 893 return "NVPTXISD::LoadParamV2"; 894 case NVPTXISD::LoadParamV4: 895 return "NVPTXISD::LoadParamV4"; 896 case NVPTXISD::StoreParam: 897 return "NVPTXISD::StoreParam"; 898 case NVPTXISD::StoreParamV2: 899 return "NVPTXISD::StoreParamV2"; 900 case NVPTXISD::StoreParamV4: 901 return "NVPTXISD::StoreParamV4"; 902 case NVPTXISD::StoreParamS32: 903 return "NVPTXISD::StoreParamS32"; 904 case NVPTXISD::StoreParamU32: 905 return "NVPTXISD::StoreParamU32"; 906 case NVPTXISD::CallArgBegin: 907 return "NVPTXISD::CallArgBegin"; 908 case NVPTXISD::CallArg: 909 return "NVPTXISD::CallArg"; 910 case NVPTXISD::LastCallArg: 911 return "NVPTXISD::LastCallArg"; 912 case NVPTXISD::CallArgEnd: 913 return "NVPTXISD::CallArgEnd"; 914 case NVPTXISD::CallVoid: 915 return "NVPTXISD::CallVoid"; 916 case NVPTXISD::CallVal: 917 return "NVPTXISD::CallVal"; 918 case NVPTXISD::CallSymbol: 919 return "NVPTXISD::CallSymbol"; 920 case NVPTXISD::Prototype: 921 return "NVPTXISD::Prototype"; 922 case NVPTXISD::MoveParam: 923 return "NVPTXISD::MoveParam"; 924 case NVPTXISD::StoreRetval: 925 
return "NVPTXISD::StoreRetval"; 926 case NVPTXISD::StoreRetvalV2: 927 return "NVPTXISD::StoreRetvalV2"; 928 case NVPTXISD::StoreRetvalV4: 929 return "NVPTXISD::StoreRetvalV4"; 930 case NVPTXISD::PseudoUseParam: 931 return "NVPTXISD::PseudoUseParam"; 932 case NVPTXISD::RETURN: 933 return "NVPTXISD::RETURN"; 934 case NVPTXISD::CallSeqBegin: 935 return "NVPTXISD::CallSeqBegin"; 936 case NVPTXISD::CallSeqEnd: 937 return "NVPTXISD::CallSeqEnd"; 938 case NVPTXISD::CallPrototype: 939 return "NVPTXISD::CallPrototype"; 940 case NVPTXISD::ProxyReg: 941 return "NVPTXISD::ProxyReg"; 942 case NVPTXISD::LoadV2: 943 return "NVPTXISD::LoadV2"; 944 case NVPTXISD::LoadV4: 945 return "NVPTXISD::LoadV4"; 946 case NVPTXISD::LDGV2: 947 return "NVPTXISD::LDGV2"; 948 case NVPTXISD::LDGV4: 949 return "NVPTXISD::LDGV4"; 950 case NVPTXISD::LDUV2: 951 return "NVPTXISD::LDUV2"; 952 case NVPTXISD::LDUV4: 953 return "NVPTXISD::LDUV4"; 954 case NVPTXISD::StoreV2: 955 return "NVPTXISD::StoreV2"; 956 case NVPTXISD::StoreV4: 957 return "NVPTXISD::StoreV4"; 958 case NVPTXISD::FUN_SHFL_CLAMP: 959 return "NVPTXISD::FUN_SHFL_CLAMP"; 960 case NVPTXISD::FUN_SHFR_CLAMP: 961 return "NVPTXISD::FUN_SHFR_CLAMP"; 962 case NVPTXISD::IMAD: 963 return "NVPTXISD::IMAD"; 964 case NVPTXISD::BFE: 965 return "NVPTXISD::BFE"; 966 case NVPTXISD::BFI: 967 return "NVPTXISD::BFI"; 968 case NVPTXISD::PRMT: 969 return "NVPTXISD::PRMT"; 970 case NVPTXISD::SETP_F16X2: 971 return "NVPTXISD::SETP_F16X2"; 972 case NVPTXISD::SETP_BF16X2: 973 return "NVPTXISD::SETP_BF16X2"; 974 case NVPTXISD::Dummy: 975 return "NVPTXISD::Dummy"; 976 case NVPTXISD::MUL_WIDE_SIGNED: 977 return "NVPTXISD::MUL_WIDE_SIGNED"; 978 case NVPTXISD::MUL_WIDE_UNSIGNED: 979 return "NVPTXISD::MUL_WIDE_UNSIGNED"; 980 case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32"; 981 case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat"; 982 case NVPTXISD::Tex1DFloatFloatLevel: 983 return "NVPTXISD::Tex1DFloatFloatLevel"; 984 case NVPTXISD::Tex1DFloatFloatGrad: 985 return "NVPTXISD::Tex1DFloatFloatGrad"; 986 case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32"; 987 case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float"; 988 case NVPTXISD::Tex1DS32FloatLevel: 989 return "NVPTXISD::Tex1DS32FloatLevel"; 990 case NVPTXISD::Tex1DS32FloatGrad: 991 return "NVPTXISD::Tex1DS32FloatGrad"; 992 case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32"; 993 case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float"; 994 case NVPTXISD::Tex1DU32FloatLevel: 995 return "NVPTXISD::Tex1DU32FloatLevel"; 996 case NVPTXISD::Tex1DU32FloatGrad: 997 return "NVPTXISD::Tex1DU32FloatGrad"; 998 case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32"; 999 case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat"; 1000 case NVPTXISD::Tex1DArrayFloatFloatLevel: 1001 return "NVPTXISD::Tex1DArrayFloatFloatLevel"; 1002 case NVPTXISD::Tex1DArrayFloatFloatGrad: 1003 return "NVPTXISD::Tex1DArrayFloatFloatGrad"; 1004 case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32"; 1005 case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float"; 1006 case NVPTXISD::Tex1DArrayS32FloatLevel: 1007 return "NVPTXISD::Tex1DArrayS32FloatLevel"; 1008 case NVPTXISD::Tex1DArrayS32FloatGrad: 1009 return "NVPTXISD::Tex1DArrayS32FloatGrad"; 1010 case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32"; 1011 case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float"; 1012 case NVPTXISD::Tex1DArrayU32FloatLevel: 1013 return 
"NVPTXISD::Tex1DArrayU32FloatLevel"; 1014 case NVPTXISD::Tex1DArrayU32FloatGrad: 1015 return "NVPTXISD::Tex1DArrayU32FloatGrad"; 1016 case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32"; 1017 case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat"; 1018 case NVPTXISD::Tex2DFloatFloatLevel: 1019 return "NVPTXISD::Tex2DFloatFloatLevel"; 1020 case NVPTXISD::Tex2DFloatFloatGrad: 1021 return "NVPTXISD::Tex2DFloatFloatGrad"; 1022 case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32"; 1023 case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float"; 1024 case NVPTXISD::Tex2DS32FloatLevel: 1025 return "NVPTXISD::Tex2DS32FloatLevel"; 1026 case NVPTXISD::Tex2DS32FloatGrad: 1027 return "NVPTXISD::Tex2DS32FloatGrad"; 1028 case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32"; 1029 case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float"; 1030 case NVPTXISD::Tex2DU32FloatLevel: 1031 return "NVPTXISD::Tex2DU32FloatLevel"; 1032 case NVPTXISD::Tex2DU32FloatGrad: 1033 return "NVPTXISD::Tex2DU32FloatGrad"; 1034 case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32"; 1035 case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; 1036 case NVPTXISD::Tex2DArrayFloatFloatLevel: 1037 return "NVPTXISD::Tex2DArrayFloatFloatLevel"; 1038 case NVPTXISD::Tex2DArrayFloatFloatGrad: 1039 return "NVPTXISD::Tex2DArrayFloatFloatGrad"; 1040 case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32"; 1041 case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float"; 1042 case NVPTXISD::Tex2DArrayS32FloatLevel: 1043 return "NVPTXISD::Tex2DArrayS32FloatLevel"; 1044 case NVPTXISD::Tex2DArrayS32FloatGrad: 1045 return "NVPTXISD::Tex2DArrayS32FloatGrad"; 1046 case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32"; 1047 case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float"; 1048 case NVPTXISD::Tex2DArrayU32FloatLevel: 1049 return "NVPTXISD::Tex2DArrayU32FloatLevel"; 1050 case NVPTXISD::Tex2DArrayU32FloatGrad: 1051 return "NVPTXISD::Tex2DArrayU32FloatGrad"; 1052 case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32"; 1053 case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat"; 1054 case NVPTXISD::Tex3DFloatFloatLevel: 1055 return "NVPTXISD::Tex3DFloatFloatLevel"; 1056 case NVPTXISD::Tex3DFloatFloatGrad: 1057 return "NVPTXISD::Tex3DFloatFloatGrad"; 1058 case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32"; 1059 case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float"; 1060 case NVPTXISD::Tex3DS32FloatLevel: 1061 return "NVPTXISD::Tex3DS32FloatLevel"; 1062 case NVPTXISD::Tex3DS32FloatGrad: 1063 return "NVPTXISD::Tex3DS32FloatGrad"; 1064 case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32"; 1065 case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float"; 1066 case NVPTXISD::Tex3DU32FloatLevel: 1067 return "NVPTXISD::Tex3DU32FloatLevel"; 1068 case NVPTXISD::Tex3DU32FloatGrad: 1069 return "NVPTXISD::Tex3DU32FloatGrad"; 1070 case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat"; 1071 case NVPTXISD::TexCubeFloatFloatLevel: 1072 return "NVPTXISD::TexCubeFloatFloatLevel"; 1073 case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float"; 1074 case NVPTXISD::TexCubeS32FloatLevel: 1075 return "NVPTXISD::TexCubeS32FloatLevel"; 1076 case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float"; 1077 case NVPTXISD::TexCubeU32FloatLevel: 1078 return "NVPTXISD::TexCubeU32FloatLevel"; 1079 case NVPTXISD::TexCubeArrayFloatFloat: 1080 return 
"NVPTXISD::TexCubeArrayFloatFloat"; 1081 case NVPTXISD::TexCubeArrayFloatFloatLevel: 1082 return "NVPTXISD::TexCubeArrayFloatFloatLevel"; 1083 case NVPTXISD::TexCubeArrayS32Float: 1084 return "NVPTXISD::TexCubeArrayS32Float"; 1085 case NVPTXISD::TexCubeArrayS32FloatLevel: 1086 return "NVPTXISD::TexCubeArrayS32FloatLevel"; 1087 case NVPTXISD::TexCubeArrayU32Float: 1088 return "NVPTXISD::TexCubeArrayU32Float"; 1089 case NVPTXISD::TexCubeArrayU32FloatLevel: 1090 return "NVPTXISD::TexCubeArrayU32FloatLevel"; 1091 case NVPTXISD::Tld4R2DFloatFloat: 1092 return "NVPTXISD::Tld4R2DFloatFloat"; 1093 case NVPTXISD::Tld4G2DFloatFloat: 1094 return "NVPTXISD::Tld4G2DFloatFloat"; 1095 case NVPTXISD::Tld4B2DFloatFloat: 1096 return "NVPTXISD::Tld4B2DFloatFloat"; 1097 case NVPTXISD::Tld4A2DFloatFloat: 1098 return "NVPTXISD::Tld4A2DFloatFloat"; 1099 case NVPTXISD::Tld4R2DS64Float: 1100 return "NVPTXISD::Tld4R2DS64Float"; 1101 case NVPTXISD::Tld4G2DS64Float: 1102 return "NVPTXISD::Tld4G2DS64Float"; 1103 case NVPTXISD::Tld4B2DS64Float: 1104 return "NVPTXISD::Tld4B2DS64Float"; 1105 case NVPTXISD::Tld4A2DS64Float: 1106 return "NVPTXISD::Tld4A2DS64Float"; 1107 case NVPTXISD::Tld4R2DU64Float: 1108 return "NVPTXISD::Tld4R2DU64Float"; 1109 case NVPTXISD::Tld4G2DU64Float: 1110 return "NVPTXISD::Tld4G2DU64Float"; 1111 case NVPTXISD::Tld4B2DU64Float: 1112 return "NVPTXISD::Tld4B2DU64Float"; 1113 case NVPTXISD::Tld4A2DU64Float: 1114 return "NVPTXISD::Tld4A2DU64Float"; 1115 1116 case NVPTXISD::TexUnified1DFloatS32: 1117 return "NVPTXISD::TexUnified1DFloatS32"; 1118 case NVPTXISD::TexUnified1DFloatFloat: 1119 return "NVPTXISD::TexUnified1DFloatFloat"; 1120 case NVPTXISD::TexUnified1DFloatFloatLevel: 1121 return "NVPTXISD::TexUnified1DFloatFloatLevel"; 1122 case NVPTXISD::TexUnified1DFloatFloatGrad: 1123 return "NVPTXISD::TexUnified1DFloatFloatGrad"; 1124 case NVPTXISD::TexUnified1DS32S32: 1125 return "NVPTXISD::TexUnified1DS32S32"; 1126 case NVPTXISD::TexUnified1DS32Float: 1127 return "NVPTXISD::TexUnified1DS32Float"; 1128 case NVPTXISD::TexUnified1DS32FloatLevel: 1129 return "NVPTXISD::TexUnified1DS32FloatLevel"; 1130 case NVPTXISD::TexUnified1DS32FloatGrad: 1131 return "NVPTXISD::TexUnified1DS32FloatGrad"; 1132 case NVPTXISD::TexUnified1DU32S32: 1133 return "NVPTXISD::TexUnified1DU32S32"; 1134 case NVPTXISD::TexUnified1DU32Float: 1135 return "NVPTXISD::TexUnified1DU32Float"; 1136 case NVPTXISD::TexUnified1DU32FloatLevel: 1137 return "NVPTXISD::TexUnified1DU32FloatLevel"; 1138 case NVPTXISD::TexUnified1DU32FloatGrad: 1139 return "NVPTXISD::TexUnified1DU32FloatGrad"; 1140 case NVPTXISD::TexUnified1DArrayFloatS32: 1141 return "NVPTXISD::TexUnified1DArrayFloatS32"; 1142 case NVPTXISD::TexUnified1DArrayFloatFloat: 1143 return "NVPTXISD::TexUnified1DArrayFloatFloat"; 1144 case NVPTXISD::TexUnified1DArrayFloatFloatLevel: 1145 return "NVPTXISD::TexUnified1DArrayFloatFloatLevel"; 1146 case NVPTXISD::TexUnified1DArrayFloatFloatGrad: 1147 return "NVPTXISD::TexUnified1DArrayFloatFloatGrad"; 1148 case NVPTXISD::TexUnified1DArrayS32S32: 1149 return "NVPTXISD::TexUnified1DArrayS32S32"; 1150 case NVPTXISD::TexUnified1DArrayS32Float: 1151 return "NVPTXISD::TexUnified1DArrayS32Float"; 1152 case NVPTXISD::TexUnified1DArrayS32FloatLevel: 1153 return "NVPTXISD::TexUnified1DArrayS32FloatLevel"; 1154 case NVPTXISD::TexUnified1DArrayS32FloatGrad: 1155 return "NVPTXISD::TexUnified1DArrayS32FloatGrad"; 1156 case NVPTXISD::TexUnified1DArrayU32S32: 1157 return "NVPTXISD::TexUnified1DArrayU32S32"; 1158 case NVPTXISD::TexUnified1DArrayU32Float: 1159 
return "NVPTXISD::TexUnified1DArrayU32Float"; 1160 case NVPTXISD::TexUnified1DArrayU32FloatLevel: 1161 return "NVPTXISD::TexUnified1DArrayU32FloatLevel"; 1162 case NVPTXISD::TexUnified1DArrayU32FloatGrad: 1163 return "NVPTXISD::TexUnified1DArrayU32FloatGrad"; 1164 case NVPTXISD::TexUnified2DFloatS32: 1165 return "NVPTXISD::TexUnified2DFloatS32"; 1166 case NVPTXISD::TexUnified2DFloatFloat: 1167 return "NVPTXISD::TexUnified2DFloatFloat"; 1168 case NVPTXISD::TexUnified2DFloatFloatLevel: 1169 return "NVPTXISD::TexUnified2DFloatFloatLevel"; 1170 case NVPTXISD::TexUnified2DFloatFloatGrad: 1171 return "NVPTXISD::TexUnified2DFloatFloatGrad"; 1172 case NVPTXISD::TexUnified2DS32S32: 1173 return "NVPTXISD::TexUnified2DS32S32"; 1174 case NVPTXISD::TexUnified2DS32Float: 1175 return "NVPTXISD::TexUnified2DS32Float"; 1176 case NVPTXISD::TexUnified2DS32FloatLevel: 1177 return "NVPTXISD::TexUnified2DS32FloatLevel"; 1178 case NVPTXISD::TexUnified2DS32FloatGrad: 1179 return "NVPTXISD::TexUnified2DS32FloatGrad"; 1180 case NVPTXISD::TexUnified2DU32S32: 1181 return "NVPTXISD::TexUnified2DU32S32"; 1182 case NVPTXISD::TexUnified2DU32Float: 1183 return "NVPTXISD::TexUnified2DU32Float"; 1184 case NVPTXISD::TexUnified2DU32FloatLevel: 1185 return "NVPTXISD::TexUnified2DU32FloatLevel"; 1186 case NVPTXISD::TexUnified2DU32FloatGrad: 1187 return "NVPTXISD::TexUnified2DU32FloatGrad"; 1188 case NVPTXISD::TexUnified2DArrayFloatS32: 1189 return "NVPTXISD::TexUnified2DArrayFloatS32"; 1190 case NVPTXISD::TexUnified2DArrayFloatFloat: 1191 return "NVPTXISD::TexUnified2DArrayFloatFloat"; 1192 case NVPTXISD::TexUnified2DArrayFloatFloatLevel: 1193 return "NVPTXISD::TexUnified2DArrayFloatFloatLevel"; 1194 case NVPTXISD::TexUnified2DArrayFloatFloatGrad: 1195 return "NVPTXISD::TexUnified2DArrayFloatFloatGrad"; 1196 case NVPTXISD::TexUnified2DArrayS32S32: 1197 return "NVPTXISD::TexUnified2DArrayS32S32"; 1198 case NVPTXISD::TexUnified2DArrayS32Float: 1199 return "NVPTXISD::TexUnified2DArrayS32Float"; 1200 case NVPTXISD::TexUnified2DArrayS32FloatLevel: 1201 return "NVPTXISD::TexUnified2DArrayS32FloatLevel"; 1202 case NVPTXISD::TexUnified2DArrayS32FloatGrad: 1203 return "NVPTXISD::TexUnified2DArrayS32FloatGrad"; 1204 case NVPTXISD::TexUnified2DArrayU32S32: 1205 return "NVPTXISD::TexUnified2DArrayU32S32"; 1206 case NVPTXISD::TexUnified2DArrayU32Float: 1207 return "NVPTXISD::TexUnified2DArrayU32Float"; 1208 case NVPTXISD::TexUnified2DArrayU32FloatLevel: 1209 return "NVPTXISD::TexUnified2DArrayU32FloatLevel"; 1210 case NVPTXISD::TexUnified2DArrayU32FloatGrad: 1211 return "NVPTXISD::TexUnified2DArrayU32FloatGrad"; 1212 case NVPTXISD::TexUnified3DFloatS32: 1213 return "NVPTXISD::TexUnified3DFloatS32"; 1214 case NVPTXISD::TexUnified3DFloatFloat: 1215 return "NVPTXISD::TexUnified3DFloatFloat"; 1216 case NVPTXISD::TexUnified3DFloatFloatLevel: 1217 return "NVPTXISD::TexUnified3DFloatFloatLevel"; 1218 case NVPTXISD::TexUnified3DFloatFloatGrad: 1219 return "NVPTXISD::TexUnified3DFloatFloatGrad"; 1220 case NVPTXISD::TexUnified3DS32S32: 1221 return "NVPTXISD::TexUnified3DS32S32"; 1222 case NVPTXISD::TexUnified3DS32Float: 1223 return "NVPTXISD::TexUnified3DS32Float"; 1224 case NVPTXISD::TexUnified3DS32FloatLevel: 1225 return "NVPTXISD::TexUnified3DS32FloatLevel"; 1226 case NVPTXISD::TexUnified3DS32FloatGrad: 1227 return "NVPTXISD::TexUnified3DS32FloatGrad"; 1228 case NVPTXISD::TexUnified3DU32S32: 1229 return "NVPTXISD::TexUnified3DU32S32"; 1230 case NVPTXISD::TexUnified3DU32Float: 1231 return "NVPTXISD::TexUnified3DU32Float"; 1232 case 
NVPTXISD::TexUnified3DU32FloatLevel: 1233 return "NVPTXISD::TexUnified3DU32FloatLevel"; 1234 case NVPTXISD::TexUnified3DU32FloatGrad: 1235 return "NVPTXISD::TexUnified3DU32FloatGrad"; 1236 case NVPTXISD::TexUnifiedCubeFloatFloat: 1237 return "NVPTXISD::TexUnifiedCubeFloatFloat"; 1238 case NVPTXISD::TexUnifiedCubeFloatFloatLevel: 1239 return "NVPTXISD::TexUnifiedCubeFloatFloatLevel"; 1240 case NVPTXISD::TexUnifiedCubeS32Float: 1241 return "NVPTXISD::TexUnifiedCubeS32Float"; 1242 case NVPTXISD::TexUnifiedCubeS32FloatLevel: 1243 return "NVPTXISD::TexUnifiedCubeS32FloatLevel"; 1244 case NVPTXISD::TexUnifiedCubeU32Float: 1245 return "NVPTXISD::TexUnifiedCubeU32Float"; 1246 case NVPTXISD::TexUnifiedCubeU32FloatLevel: 1247 return "NVPTXISD::TexUnifiedCubeU32FloatLevel"; 1248 case NVPTXISD::TexUnifiedCubeArrayFloatFloat: 1249 return "NVPTXISD::TexUnifiedCubeArrayFloatFloat"; 1250 case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: 1251 return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel"; 1252 case NVPTXISD::TexUnifiedCubeArrayS32Float: 1253 return "NVPTXISD::TexUnifiedCubeArrayS32Float"; 1254 case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: 1255 return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel"; 1256 case NVPTXISD::TexUnifiedCubeArrayU32Float: 1257 return "NVPTXISD::TexUnifiedCubeArrayU32Float"; 1258 case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: 1259 return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel"; 1260 case NVPTXISD::Tld4UnifiedR2DFloatFloat: 1261 return "NVPTXISD::Tld4UnifiedR2DFloatFloat"; 1262 case NVPTXISD::Tld4UnifiedG2DFloatFloat: 1263 return "NVPTXISD::Tld4UnifiedG2DFloatFloat"; 1264 case NVPTXISD::Tld4UnifiedB2DFloatFloat: 1265 return "NVPTXISD::Tld4UnifiedB2DFloatFloat"; 1266 case NVPTXISD::Tld4UnifiedA2DFloatFloat: 1267 return "NVPTXISD::Tld4UnifiedA2DFloatFloat"; 1268 case NVPTXISD::Tld4UnifiedR2DS64Float: 1269 return "NVPTXISD::Tld4UnifiedR2DS64Float"; 1270 case NVPTXISD::Tld4UnifiedG2DS64Float: 1271 return "NVPTXISD::Tld4UnifiedG2DS64Float"; 1272 case NVPTXISD::Tld4UnifiedB2DS64Float: 1273 return "NVPTXISD::Tld4UnifiedB2DS64Float"; 1274 case NVPTXISD::Tld4UnifiedA2DS64Float: 1275 return "NVPTXISD::Tld4UnifiedA2DS64Float"; 1276 case NVPTXISD::Tld4UnifiedR2DU64Float: 1277 return "NVPTXISD::Tld4UnifiedR2DU64Float"; 1278 case NVPTXISD::Tld4UnifiedG2DU64Float: 1279 return "NVPTXISD::Tld4UnifiedG2DU64Float"; 1280 case NVPTXISD::Tld4UnifiedB2DU64Float: 1281 return "NVPTXISD::Tld4UnifiedB2DU64Float"; 1282 case NVPTXISD::Tld4UnifiedA2DU64Float: 1283 return "NVPTXISD::Tld4UnifiedA2DU64Float"; 1284 1285 case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp"; 1286 case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp"; 1287 case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp"; 1288 case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp"; 1289 case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp"; 1290 case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp"; 1291 case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp"; 1292 case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp"; 1293 case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp"; 1294 case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp"; 1295 case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp"; 1296 1297 case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp"; 1298 case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp"; 1299 case 
NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp"; 1300 case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp"; 1301 case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp"; 1302 case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp"; 1303 case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp"; 1304 case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp"; 1305 case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp"; 1306 case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp"; 1307 case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp"; 1308 1309 case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp"; 1310 case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp"; 1311 case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp"; 1312 case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp"; 1313 case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp"; 1314 case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp"; 1315 case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp"; 1316 case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp"; 1317 case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp"; 1318 case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp"; 1319 case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp"; 1320 1321 case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp"; 1322 case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp"; 1323 case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp"; 1324 case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp"; 1325 case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp"; 1326 case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp"; 1327 case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp"; 1328 case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp"; 1329 case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp"; 1330 case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp"; 1331 case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp"; 1332 1333 case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp"; 1334 case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp"; 1335 case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp"; 1336 case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp"; 1337 case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp"; 1338 case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp"; 1339 case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp"; 1340 case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp"; 1341 case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp"; 1342 case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp"; 1343 case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp"; 1344 1345 case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap"; 1346 case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap"; 1347 case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap"; 1348 case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap"; 1349 
case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap"; 1350 case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap"; 1351 case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap"; 1352 case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap"; 1353 case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap"; 1354 case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap"; 1355 case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap"; 1356 1357 case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap"; 1358 case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap"; 1359 case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap"; 1360 case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap"; 1361 case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap"; 1362 case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap"; 1363 case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap"; 1364 case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap"; 1365 case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap"; 1366 case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap"; 1367 case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap"; 1368 1369 case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap"; 1370 case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap"; 1371 case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap"; 1372 case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap"; 1373 case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap"; 1374 case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap"; 1375 case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap"; 1376 case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap"; 1377 case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap"; 1378 case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap"; 1379 case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap"; 1380 1381 case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap"; 1382 case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap"; 1383 case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap"; 1384 case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap"; 1385 case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap"; 1386 case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap"; 1387 case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap"; 1388 case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap"; 1389 case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap"; 1390 case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap"; 1391 case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap"; 1392 1393 case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap"; 1394 case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap"; 1395 case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap"; 1396 case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap"; 1397 case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap"; 1398 case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap"; 1399 case NVPTXISD::Suld3DV2I32Trap: return 
"NVPTXISD::Suld3DV2I32Trap"; 1400 case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap"; 1401 case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap"; 1402 case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap"; 1403 case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap"; 1404 1405 case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero"; 1406 case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero"; 1407 case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero"; 1408 case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero"; 1409 case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero"; 1410 case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero"; 1411 case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero"; 1412 case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero"; 1413 case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero"; 1414 case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero"; 1415 case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero"; 1416 1417 case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero"; 1418 case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero"; 1419 case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero"; 1420 case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero"; 1421 case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero"; 1422 case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero"; 1423 case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero"; 1424 case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero"; 1425 case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero"; 1426 case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero"; 1427 case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero"; 1428 1429 case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero"; 1430 case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero"; 1431 case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero"; 1432 case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero"; 1433 case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero"; 1434 case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero"; 1435 case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero"; 1436 case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero"; 1437 case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero"; 1438 case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero"; 1439 case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero"; 1440 1441 case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero"; 1442 case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero"; 1443 case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero"; 1444 case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero"; 1445 case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero"; 1446 case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero"; 1447 case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero"; 1448 case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero"; 1449 case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero"; 1450 case 
NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero"; 1451 case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero"; 1452 1453 case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero"; 1454 case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero"; 1455 case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero"; 1456 case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero"; 1457 case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero"; 1458 case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero"; 1459 case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero"; 1460 case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero"; 1461 case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero"; 1462 case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero"; 1463 case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero"; 1464 } 1465 return nullptr; 1466 } 1467 1468 TargetLoweringBase::LegalizeTypeAction 1469 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const { 1470 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && 1471 VT.getScalarType() == MVT::i1) 1472 return TypeSplitVector; 1473 if (Isv2x16VT(VT)) 1474 return TypeLegal; 1475 return TargetLoweringBase::getPreferredVectorAction(VT); 1476 } 1477 1478 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, 1479 int Enabled, int &ExtraSteps, 1480 bool &UseOneConst, 1481 bool Reciprocal) const { 1482 if (!(Enabled == ReciprocalEstimate::Enabled || 1483 (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) 1484 return SDValue(); 1485 1486 if (ExtraSteps == ReciprocalEstimate::Unspecified) 1487 ExtraSteps = 0; 1488 1489 SDLoc DL(Operand); 1490 EVT VT = Operand.getValueType(); 1491 bool Ftz = useF32FTZ(DAG.getMachineFunction()); 1492 1493 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) { 1494 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 1495 DAG.getConstant(IID, DL, MVT::i32), Operand); 1496 }; 1497 1498 // The sqrt and rsqrt refinement processes assume we always start out with an 1499 // approximation of the rsqrt. Therefore, if we're going to do any refinement 1500 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing 1501 // any refinement, we must return a regular sqrt. 1502 if (Reciprocal || ExtraSteps > 0) { 1503 if (VT == MVT::f32) 1504 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f 1505 : Intrinsic::nvvm_rsqrt_approx_f); 1506 else if (VT == MVT::f64) 1507 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d); 1508 else 1509 return SDValue(); 1510 } else { 1511 if (VT == MVT::f32) 1512 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f 1513 : Intrinsic::nvvm_sqrt_approx_f); 1514 else { 1515 // There's no sqrt.approx.f64 instruction, so we emit 1516 // reciprocal(rsqrt(x)). This is faster than 1517 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain 1518 // x * rsqrt(x).) 
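        // (A note on the identity relied on here: rcp.approx(rsqrt.approx(x))
        // computes 1 / (1/sqrt(x)) == sqrt(x), up to approximation error.)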
1519 return DAG.getNode( 1520 ISD::INTRINSIC_WO_CHAIN, DL, VT, 1521 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32), 1522 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d)); 1523 } 1524 } 1525 } 1526 1527 SDValue 1528 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 1529 SDLoc dl(Op); 1530 const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op); 1531 auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace()); 1532 Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT); 1533 return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op); 1534 } 1535 1536 static bool IsTypePassedAsArray(const Type *Ty) { 1537 return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) || 1538 Ty->isHalfTy() || Ty->isBFloatTy(); 1539 } 1540 1541 std::string NVPTXTargetLowering::getPrototype( 1542 const DataLayout &DL, Type *retTy, const ArgListTy &Args, 1543 const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment, 1544 std::optional<std::pair<unsigned, const APInt &>> VAInfo, 1545 const CallBase &CB, unsigned UniqueCallSite) const { 1546 auto PtrVT = getPointerTy(DL); 1547 1548 bool isABI = (STI.getSmVersion() >= 20); 1549 assert(isABI && "Non-ABI compilation is not supported"); 1550 if (!isABI) 1551 return ""; 1552 1553 std::string Prototype; 1554 raw_string_ostream O(Prototype); 1555 O << "prototype_" << UniqueCallSite << " : .callprototype "; 1556 1557 if (retTy->getTypeID() == Type::VoidTyID) { 1558 O << "()"; 1559 } else { 1560 O << "("; 1561 if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) && 1562 !IsTypePassedAsArray(retTy)) { 1563 unsigned size = 0; 1564 if (auto *ITy = dyn_cast<IntegerType>(retTy)) { 1565 size = ITy->getBitWidth(); 1566 } else { 1567 assert(retTy->isFloatingPointTy() && 1568 "Floating point type expected here"); 1569 size = retTy->getPrimitiveSizeInBits(); 1570 } 1571 // PTX ABI requires all scalar return values to be at least 32 1572 // bits in size. fp16 normally uses .b16 as its storage type in 1573 // PTX, so its size must be adjusted here, too. 1574 size = promoteScalarArgumentSize(size); 1575 1576 O << ".param .b" << size << " _"; 1577 } else if (isa<PointerType>(retTy)) { 1578 O << ".param .b" << PtrVT.getSizeInBits() << " _"; 1579 } else if (IsTypePassedAsArray(retTy)) { 1580 O << ".param .align " << (retAlignment ? retAlignment->value() : 0) 1581 << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]"; 1582 } else { 1583 llvm_unreachable("Unknown return type"); 1584 } 1585 O << ") "; 1586 } 1587 O << "_ ("; 1588 1589 bool first = true; 1590 1591 const Function *F = CB.getFunction(); 1592 unsigned NumArgs = VAInfo ? 
VAInfo->first : Args.size(); 1593 for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) { 1594 Type *Ty = Args[i].Ty; 1595 if (!first) { 1596 O << ", "; 1597 } 1598 first = false; 1599 1600 if (!Outs[OIdx].Flags.isByVal()) { 1601 if (IsTypePassedAsArray(Ty)) { 1602 unsigned ParamAlign = 0; 1603 const CallInst *CallI = cast<CallInst>(&CB); 1604 // +1 because index 0 is reserved for return type alignment 1605 if (!getAlign(*CallI, i + 1, ParamAlign)) 1606 ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value(); 1607 O << ".param .align " << ParamAlign << " .b8 "; 1608 O << "_"; 1609 O << "[" << DL.getTypeAllocSize(Ty) << "]"; 1610 // update the index for Outs 1611 SmallVector<EVT, 16> vtparts; 1612 ComputeValueVTs(*this, DL, Ty, vtparts); 1613 if (unsigned len = vtparts.size()) 1614 OIdx += len - 1; 1615 continue; 1616 } 1617 // i8 types in IR will be i16 types in SDAG 1618 assert((getValueType(DL, Ty) == Outs[OIdx].VT || 1619 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && 1620 "type mismatch between callee prototype and arguments"); 1621 // scalar type 1622 unsigned sz = 0; 1623 if (isa<IntegerType>(Ty)) { 1624 sz = cast<IntegerType>(Ty)->getBitWidth(); 1625 sz = promoteScalarArgumentSize(sz); 1626 } else if (isa<PointerType>(Ty)) { 1627 sz = PtrVT.getSizeInBits(); 1628 } else { 1629 sz = Ty->getPrimitiveSizeInBits(); 1630 } 1631 O << ".param .b" << sz << " "; 1632 O << "_"; 1633 continue; 1634 } 1635 1636 Type *ETy = Args[i].IndirectType; 1637 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); 1638 Align ParamByValAlign = 1639 getFunctionByValParamAlign(F, ETy, InitialAlign, DL); 1640 1641 O << ".param .align " << ParamByValAlign.value() << " .b8 "; 1642 O << "_"; 1643 O << "[" << Outs[OIdx].Flags.getByValSize() << "]"; 1644 } 1645 1646 if (VAInfo) 1647 O << (first ? "" : ",") << " .param .align " << VAInfo->second 1648 << " .b8 _[]\n"; 1649 O << ")"; 1650 if (shouldEmitPTXNoReturn(&CB, *nvTM)) 1651 O << " .noreturn"; 1652 O << ";"; 1653 1654 return Prototype; 1655 } 1656 1657 Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, 1658 const CallBase *CB, Type *Ty, 1659 unsigned Idx, 1660 const DataLayout &DL) const { 1661 if (!CB) { 1662 // CallSite is zero, fallback to ABI type alignment 1663 return DL.getABITypeAlign(Ty); 1664 } 1665 1666 unsigned Alignment = 0; 1667 const Function *DirectCallee = CB->getCalledFunction(); 1668 1669 if (!DirectCallee) { 1670 // We don't have a direct function symbol, but that may be because of 1671 // constant cast instructions in the call. 
1672 1673 // With bitcast'd call targets, the instruction will be the call 1674 if (const auto *CI = dyn_cast<CallInst>(CB)) { 1675 // Check if we have call alignment metadata 1676 if (getAlign(*CI, Idx, Alignment)) 1677 return Align(Alignment); 1678 } 1679 DirectCallee = getMaybeBitcastedCallee(CB); 1680 } 1681 1682 // Check for function alignment information if we found that the 1683 // ultimate target is a Function 1684 if (DirectCallee) { 1685 if (getAlign(*DirectCallee, Idx, Alignment)) 1686 return Align(Alignment); 1687 // If alignment information is not available, fall back to the 1688 // default function param optimized type alignment 1689 return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL); 1690 } 1691 1692 // Call is indirect, fall back to the ABI type alignment 1693 return DL.getABITypeAlign(Ty); 1694 } 1695 1696 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 1697 SmallVectorImpl<SDValue> &InVals) const { 1698 1699 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30)) 1700 report_fatal_error( 1701 "Support for variadic functions (unsized array parameter) introduced " 1702 "in PTX ISA version 6.0 and requires target sm_30."); 1703 1704 SelectionDAG &DAG = CLI.DAG; 1705 SDLoc dl = CLI.DL; 1706 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1707 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1708 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1709 SDValue Chain = CLI.Chain; 1710 SDValue Callee = CLI.Callee; 1711 bool &isTailCall = CLI.IsTailCall; 1712 ArgListTy &Args = CLI.getArgs(); 1713 Type *RetTy = CLI.RetTy; 1714 const CallBase *CB = CLI.CB; 1715 const DataLayout &DL = DAG.getDataLayout(); 1716 1717 bool isABI = (STI.getSmVersion() >= 20); 1718 assert(isABI && "Non-ABI compilation is not supported"); 1719 if (!isABI) 1720 return Chain; 1721 1722 // Variadic arguments. 1723 // 1724 // Normally, for each argument, we declare a param scalar or a param 1725 // byte array in the .param space, and store the argument value to that 1726 // param scalar or array starting at offset 0. 1727 // 1728 // In the case of the first variadic argument, we declare a vararg byte array 1729 // with size 0. The exact size of this array isn't known at this point, so 1730 // it'll be patched later. All the variadic arguments will be stored to this 1731 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is 1732 // initially set to 0, so it can be used for non-variadic arguments (which use 1733 // 0 offset) to simplify the code. 1734 // 1735 // After all vararg is processed, 'VAOffset' holds the size of the 1736 // vararg byte array. 1737 1738 SDValue VADeclareParam; // vararg byte array 1739 unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic 1740 unsigned VAOffset = 0; // current offset in the param array 1741 1742 unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1); 1743 SDValue TempChain = Chain; 1744 Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl); 1745 SDValue InGlue = Chain.getValue(1); 1746 1747 unsigned ParamCount = 0; 1748 // Args.size() and Outs.size() need not match. 1749 // Outs.size() will be larger 1750 // * if there is an aggregate argument with multiple fields (each field 1751 // showing up separately in Outs) 1752 // * if there is a vector argument with more than typical vector-length 1753 // elements (generally if more than 4) where each vector element is 1754 // individually present in Outs. 
1755 // So a different index should be used for indexing into Outs/OutVals. 1756 // See similar issue in LowerFormalArguments. 1757 unsigned OIdx = 0; 1758 // Declare the .params or .reg need to pass values 1759 // to the function 1760 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { 1761 EVT VT = Outs[OIdx].VT; 1762 Type *Ty = Args[i].Ty; 1763 bool IsVAArg = (i >= CLI.NumFixedArgs); 1764 bool IsByVal = Outs[OIdx].Flags.isByVal(); 1765 1766 SmallVector<EVT, 16> VTs; 1767 SmallVector<uint64_t, 16> Offsets; 1768 1769 assert((!IsByVal || Args[i].IndirectType) && 1770 "byval arg must have indirect type"); 1771 Type *ETy = (IsByVal ? Args[i].IndirectType : Ty); 1772 ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset); 1773 1774 Align ArgAlign; 1775 if (IsByVal) { 1776 // The ByValAlign in the Outs[OIdx].Flags is always set at this point, 1777 // so we don't need to worry whether it's naturally aligned or not. 1778 // See TargetLowering::LowerCallTo(). 1779 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); 1780 ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy, 1781 InitialAlign, DL); 1782 if (IsVAArg) 1783 VAOffset = alignTo(VAOffset, ArgAlign); 1784 } else { 1785 ArgAlign = getArgumentAlignment(Callee, CB, Ty, ParamCount + 1, DL); 1786 } 1787 1788 unsigned TypeSize = 1789 (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty)); 1790 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1791 1792 bool NeedAlign; // Does argument declaration specify alignment? 1793 bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty); 1794 if (IsVAArg) { 1795 if (ParamCount == FirstVAArg) { 1796 SDValue DeclareParamOps[] = { 1797 Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32), 1798 DAG.getConstant(ParamCount, dl, MVT::i32), 1799 DAG.getConstant(1, dl, MVT::i32), InGlue}; 1800 VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, 1801 DeclareParamVTs, DeclareParamOps); 1802 } 1803 NeedAlign = PassAsArray; 1804 } else if (PassAsArray) { 1805 // declare .param .align <align> .b8 .param<n>[<size>]; 1806 SDValue DeclareParamOps[] = { 1807 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32), 1808 DAG.getConstant(ParamCount, dl, MVT::i32), 1809 DAG.getConstant(TypeSize, dl, MVT::i32), InGlue}; 1810 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, 1811 DeclareParamOps); 1812 NeedAlign = true; 1813 } else { 1814 // declare .param .b<size> .param<n>; 1815 if (VT.isInteger() || VT.isFloatingPoint()) { 1816 // PTX ABI requires integral types to be at least 32 bits in 1817 // size. FP16 is loaded/stored using i16, so it's handled 1818 // here as well. 1819 TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8; 1820 } 1821 SDValue DeclareScalarParamOps[] = { 1822 Chain, DAG.getConstant(ParamCount, dl, MVT::i32), 1823 DAG.getConstant(TypeSize * 8, dl, MVT::i32), 1824 DAG.getConstant(0, dl, MVT::i32), InGlue}; 1825 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, 1826 DeclareScalarParamOps); 1827 NeedAlign = false; 1828 } 1829 InGlue = Chain.getValue(1); 1830 1831 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter 1832 // than 32-bits are sign extended or zero extended, depending on 1833 // whether they are signed or unsigned types. This case applies 1834 // only to scalar parameters and not to aggregate values. 
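    // For example (an illustrative sketch only, not the literal PTX printed
    // here): an i16 scalar argument ends up declared and written as a 32-bit
    // param, roughly
    //   .param .b32 param0;
    //   st.param.b32 [param0], %r1;
    // where %r1 is a placeholder register name.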
1835 bool ExtendIntegerParam = 1836 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; 1837 1838 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg); 1839 SmallVector<SDValue, 6> StoreOperands; 1840 for (unsigned j = 0, je = VTs.size(); j != je; ++j) { 1841 EVT EltVT = VTs[j]; 1842 int CurOffset = Offsets[j]; 1843 MaybeAlign PartAlign; 1844 if (NeedAlign) 1845 PartAlign = commonAlignment(ArgAlign, CurOffset); 1846 1847 // New store. 1848 if (VectorInfo[j] & PVF_FIRST) { 1849 assert(StoreOperands.empty() && "Unfinished preceding store."); 1850 StoreOperands.push_back(Chain); 1851 StoreOperands.push_back( 1852 DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32)); 1853 StoreOperands.push_back(DAG.getConstant( 1854 IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset), 1855 dl, MVT::i32)); 1856 } 1857 1858 SDValue StVal = OutVals[OIdx]; 1859 1860 MVT PromotedVT; 1861 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) { 1862 EltVT = EVT(PromotedVT); 1863 } 1864 if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) { 1865 llvm::ISD::NodeType Ext = 1866 Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 1867 StVal = DAG.getNode(Ext, dl, PromotedVT, StVal); 1868 } 1869 1870 if (IsByVal) { 1871 auto PtrVT = getPointerTy(DL); 1872 SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal, 1873 DAG.getConstant(CurOffset, dl, PtrVT)); 1874 StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(), 1875 PartAlign); 1876 } else if (ExtendIntegerParam) { 1877 assert(VTs.size() == 1 && "Scalar can't have multiple parts."); 1878 // zext/sext to i32 1879 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 1880 : ISD::ZERO_EXTEND, 1881 dl, MVT::i32, StVal); 1882 } 1883 1884 if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) { 1885 // Use 16-bit registers for small stores as it's the 1886 // smallest general purpose register size supported by NVPTX. 1887 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); 1888 } 1889 1890 // Record the value to store. 1891 StoreOperands.push_back(StVal); 1892 1893 if (VectorInfo[j] & PVF_LAST) { 1894 unsigned NumElts = StoreOperands.size() - 3; 1895 NVPTXISD::NodeType Op; 1896 switch (NumElts) { 1897 case 1: 1898 Op = NVPTXISD::StoreParam; 1899 break; 1900 case 2: 1901 Op = NVPTXISD::StoreParamV2; 1902 break; 1903 case 4: 1904 Op = NVPTXISD::StoreParamV4; 1905 break; 1906 default: 1907 llvm_unreachable("Invalid vector info."); 1908 } 1909 1910 StoreOperands.push_back(InGlue); 1911 1912 // Adjust type of the store op if we've extended the scalar 1913 // return value. 1914 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; 1915 1916 Chain = DAG.getMemIntrinsicNode( 1917 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, 1918 TheStoreType, MachinePointerInfo(), PartAlign, 1919 MachineMemOperand::MOStore); 1920 InGlue = Chain.getValue(1); 1921 1922 // Cleanup. 1923 StoreOperands.clear(); 1924 1925 // TODO: We may need to support vector types that can be passed 1926 // as scalars in variadic arguments. 
        if (!IsByVal && IsVAArg) {
          assert(NumElts == 1 &&
                 "Vectorization is expected to be disabled for variadics.");
          VAOffset += DL.getTypeAllocSize(
              TheStoreType.getTypeForEVT(*DAG.getContext()));
        }
      }
      if (!IsByVal)
        ++OIdx;
    }
    assert(StoreOperands.empty() && "Unfinished parameter store.");
    if (!IsByVal && VTs.size() > 0)
      --OIdx;
    ++ParamCount;
    if (IsByVal && IsVAArg)
      VAOffset += TypeSize;
  }

  GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
  MaybeAlign retAlignment = std::nullopt;

  // Handle Result
  if (Ins.size() > 0) {
    SmallVector<EVT, 16> resvtparts;
    ComputeValueVTs(*this, DL, RetTy, resvtparts);

    // Declare
    //  .param .align N .b8 retval0[<size-in-bytes>], or
    //  .param .b<size-in-bits> retval0
    unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
    if (!IsTypePassedAsArray(RetTy)) {
      resultsz = promoteScalarArgumentSize(resultsz);
      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
                                  DAG.getConstant(resultsz, dl, MVT::i32),
                                  DAG.getConstant(0, dl, MVT::i32), InGlue };
      Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
                          DeclareRetOps);
      InGlue = Chain.getValue(1);
    } else {
      retAlignment = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
      assert(retAlignment && "retAlignment is guaranteed to be set");
      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue DeclareRetOps[] = {
          Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
          DAG.getConstant(resultsz / 8, dl, MVT::i32),
          DAG.getConstant(0, dl, MVT::i32), InGlue};
      Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
                          DeclareRetOps);
      InGlue = Chain.getValue(1);
    }
  }

  bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
  // Set the size of the vararg param byte array if the callee is a variadic
  // function and the variadic part is not empty.
  if (HasVAArgs) {
    SDValue DeclareParamOps[] = {
        VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),
        VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),
        VADeclareParam.getOperand(4)};
    DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
                    VADeclareParam->getVTList(), DeclareParamOps);
  }

  // Both indirect calls and libcalls have nullptr Func. In order to distinguish
  // between them we must rely on the call site value which is valid for
  // indirect calls but is always null for libcalls.
  bool isIndirectCall = !Func && CB;

  if (isa<ExternalSymbolSDNode>(Callee)) {
    Function* CalleeFunc = nullptr;

    // Try to find the callee in the current module.
    Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
    assert(CalleeFunc != nullptr && "Libcall callee must be set.");

    // Set the "libcall callee" attribute to indicate that the function
    // must always have a declaration.
    CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
  }

  if (isIndirectCall) {
    // This is the indirect function call case: PTX requires a prototype of
    // the form
    // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
    // to be emitted, and the label has to be used as the last arg of the call
    // instruction.
2015 // The prototype is embedded in a string and put as the operand for a 2016 // CallPrototype SDNode which will print out to the value of the string. 2017 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2018 std::string Proto = getPrototype( 2019 DL, RetTy, Args, Outs, retAlignment, 2020 HasVAArgs 2021 ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair( 2022 CLI.NumFixedArgs, 2023 cast<ConstantSDNode>(VADeclareParam->getOperand(1)) 2024 ->getAPIntValue())) 2025 : std::nullopt, 2026 *CB, UniqueCallSite); 2027 const char *ProtoStr = nvTM->getStrPool().save(Proto).data(); 2028 SDValue ProtoOps[] = { 2029 Chain, 2030 DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), 2031 InGlue, 2032 }; 2033 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps); 2034 InGlue = Chain.getValue(1); 2035 } 2036 // Op to just print "call" 2037 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2038 SDValue PrintCallOps[] = { 2039 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue 2040 }; 2041 // We model convergent calls as separate opcodes. 2042 unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni; 2043 if (CLI.IsConvergent) 2044 Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni 2045 : NVPTXISD::PrintConvergentCall; 2046 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps); 2047 InGlue = Chain.getValue(1); 2048 2049 // Ops to print out the function name 2050 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2051 SDValue CallVoidOps[] = { Chain, Callee, InGlue }; 2052 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps); 2053 InGlue = Chain.getValue(1); 2054 2055 // Ops to print out the param list 2056 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2057 SDValue CallArgBeginOps[] = { Chain, InGlue }; 2058 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs, 2059 CallArgBeginOps); 2060 InGlue = Chain.getValue(1); 2061 2062 for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e; 2063 ++i) { 2064 unsigned opcode; 2065 if (i == (e - 1)) 2066 opcode = NVPTXISD::LastCallArg; 2067 else 2068 opcode = NVPTXISD::CallArg; 2069 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2070 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 2071 DAG.getConstant(i, dl, MVT::i32), InGlue }; 2072 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps); 2073 InGlue = Chain.getValue(1); 2074 } 2075 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2076 SDValue CallArgEndOps[] = { Chain, 2077 DAG.getConstant(isIndirectCall ? 
0 : 1, dl, MVT::i32), 2078 InGlue }; 2079 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); 2080 InGlue = Chain.getValue(1); 2081 2082 if (isIndirectCall) { 2083 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2084 SDValue PrototypeOps[] = { 2085 Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue}; 2086 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps); 2087 InGlue = Chain.getValue(1); 2088 } 2089 2090 SmallVector<SDValue, 16> ProxyRegOps; 2091 SmallVector<std::optional<MVT>, 16> ProxyRegTruncates; 2092 2093 // Generate loads from param memory/moves from registers for result 2094 if (Ins.size() > 0) { 2095 SmallVector<EVT, 16> VTs; 2096 SmallVector<uint64_t, 16> Offsets; 2097 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); 2098 assert(VTs.size() == Ins.size() && "Bad value decomposition"); 2099 2100 Align RetAlign = getArgumentAlignment(Callee, CB, RetTy, 0, DL); 2101 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); 2102 2103 SmallVector<EVT, 6> LoadVTs; 2104 int VecIdx = -1; // Index of the first element of the vector. 2105 2106 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 2107 // 32-bits are sign extended or zero extended, depending on whether 2108 // they are signed or unsigned types. 2109 bool ExtendIntegerRetVal = 2110 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 2111 2112 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 2113 bool needTruncate = false; 2114 EVT TheLoadType = VTs[i]; 2115 EVT EltType = Ins[i].VT; 2116 Align EltAlign = commonAlignment(RetAlign, Offsets[i]); 2117 MVT PromotedVT; 2118 2119 if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) { 2120 TheLoadType = EVT(PromotedVT); 2121 EltType = EVT(PromotedVT); 2122 needTruncate = true; 2123 } 2124 2125 if (ExtendIntegerRetVal) { 2126 TheLoadType = MVT::i32; 2127 EltType = MVT::i32; 2128 needTruncate = true; 2129 } else if (TheLoadType.getSizeInBits() < 16) { 2130 if (VTs[i].isInteger()) 2131 needTruncate = true; 2132 EltType = MVT::i16; 2133 } 2134 2135 // Record index of the very first element of the vector. 
2136 if (VectorInfo[i] & PVF_FIRST) { 2137 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); 2138 VecIdx = i; 2139 } 2140 2141 LoadVTs.push_back(EltType); 2142 2143 if (VectorInfo[i] & PVF_LAST) { 2144 unsigned NumElts = LoadVTs.size(); 2145 LoadVTs.push_back(MVT::Other); 2146 LoadVTs.push_back(MVT::Glue); 2147 NVPTXISD::NodeType Op; 2148 switch (NumElts) { 2149 case 1: 2150 Op = NVPTXISD::LoadParam; 2151 break; 2152 case 2: 2153 Op = NVPTXISD::LoadParamV2; 2154 break; 2155 case 4: 2156 Op = NVPTXISD::LoadParamV4; 2157 break; 2158 default: 2159 llvm_unreachable("Invalid vector info."); 2160 } 2161 2162 SDValue LoadOperands[] = { 2163 Chain, DAG.getConstant(1, dl, MVT::i32), 2164 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue}; 2165 SDValue RetVal = DAG.getMemIntrinsicNode( 2166 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, 2167 MachinePointerInfo(), EltAlign, 2168 MachineMemOperand::MOLoad); 2169 2170 for (unsigned j = 0; j < NumElts; ++j) { 2171 ProxyRegOps.push_back(RetVal.getValue(j)); 2172 2173 if (needTruncate) 2174 ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT)); 2175 else 2176 ProxyRegTruncates.push_back(std::optional<MVT>()); 2177 } 2178 2179 Chain = RetVal.getValue(NumElts); 2180 InGlue = RetVal.getValue(NumElts + 1); 2181 2182 // Cleanup 2183 VecIdx = -1; 2184 LoadVTs.clear(); 2185 } 2186 } 2187 } 2188 2189 Chain = 2190 DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl); 2191 InGlue = Chain.getValue(1); 2192 2193 // Append ProxyReg instructions to the chain to make sure that `callseq_end` 2194 // will not get lost. Otherwise, during libcalls expansion, the nodes can become 2195 // dangling. 2196 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) { 2197 SDValue Ret = DAG.getNode( 2198 NVPTXISD::ProxyReg, dl, 2199 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue), 2200 { Chain, ProxyRegOps[i], InGlue } 2201 ); 2202 2203 Chain = Ret.getValue(1); 2204 InGlue = Ret.getValue(2); 2205 2206 if (ProxyRegTruncates[i]) { 2207 Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret); 2208 } 2209 2210 InVals.push_back(Ret); 2211 } 2212 2213 // set isTailCall to false for now, until we figure out how to express 2214 // tail call optimization in PTX 2215 isTailCall = false; 2216 return Chain; 2217 } 2218 2219 SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 2220 SelectionDAG &DAG) const { 2221 const Function &Fn = DAG.getMachineFunction().getFunction(); 2222 2223 DiagnosticInfoUnsupported NoDynamicAlloca( 2224 Fn, "dynamic alloca unsupported by NVPTX backend", 2225 SDLoc(Op).getDebugLoc()); 2226 DAG.getContext()->diagnose(NoDynamicAlloca); 2227 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)}; 2228 return DAG.getMergeValues(Ops, SDLoc()); 2229 } 2230 2231 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() 2232 // (see LegalizeDAG.cpp). This is slow and uses local memory. 
// We use extract/insert/build vector instead, just as LegalizeOp() did in
// LLVM 2.5.
SDValue
NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  SmallVector<SDValue, 8> Ops;
  unsigned NumOperands = Node->getNumOperands();
  for (unsigned i = 0; i < NumOperands; ++i) {
    SDValue SubOp = Node->getOperand(i);
    EVT VVT = SubOp.getNode()->getValueType(0);
    EVT EltVT = VVT.getVectorElementType();
    unsigned NumSubElem = VVT.getVectorNumElements();
    for (unsigned j = 0; j < NumSubElem; ++j) {
      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
                                DAG.getIntPtrConstant(j, dl)));
    }
  }
  return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
}

// We can initialize a constant f16x2/v2i16/v4i8 with a single .b32 move.
// Normally it would get lowered as two constant loads and a vector-packing
// move. Instead we want just a constant move:
//        mov.b32 %r2, 0x40003C00
SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                               SelectionDAG &DAG) const {
  EVT VT = Op->getValueType(0);
  if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
    return Op;

  SDLoc DL(Op);

  if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
        return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
               isa<ConstantFPSDNode>(Operand);
      })) {
    // Lower a non-constant v4i8 vector as a byte-wise constructed i32, which
    // allows us to optimize calculation of the constant parts.
    if (VT == MVT::v4i8) {
      SDValue C8 = DAG.getConstant(8, DL, MVT::i32);
      SDValue E01 = DAG.getNode(
          NVPTXISD::BFI, DL, MVT::i32,
          DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
          DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
      SDValue E012 =
          DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
                      DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
                      E01, DAG.getConstant(16, DL, MVT::i32), C8);
      SDValue E0123 =
          DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
                      DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
                      E012, DAG.getConstant(24, DL, MVT::i32), C8);
      return DAG.getNode(ISD::BITCAST, DL, VT, E0123);
    }
    return Op;
  }

  // Get the value of the Nth operand as an APInt(32). Undef values are
  // treated as 0.
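  // Worked example (for illustration): BUILD_VECTOR <2 x half> <1.0, 2.0>
  // packs 0x3C00 (1.0) into bits 0..15 and 0x4000 (2.0) into bits 16..31,
  // yielding the single i32 constant 0x40003C00 that is then bitcast back to
  // the vector type.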
2291 auto GetOperand = [](SDValue Op, int N) -> APInt { 2292 const SDValue &Operand = Op->getOperand(N); 2293 EVT VT = Op->getValueType(0); 2294 if (Operand->isUndef()) 2295 return APInt(32, 0); 2296 APInt Value; 2297 if (VT == MVT::v2f16 || VT == MVT::v2bf16) 2298 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt(); 2299 else if (VT == MVT::v2i16 || VT == MVT::v4i8) 2300 Value = cast<ConstantSDNode>(Operand)->getAPIntValue(); 2301 else 2302 llvm_unreachable("Unsupported type"); 2303 // i8 values are carried around as i16, so we need to zero out upper bits, 2304 // so they do not get in the way of combining individual byte values 2305 if (VT == MVT::v4i8) 2306 Value = Value.trunc(8); 2307 return Value.zext(32); 2308 }; 2309 APInt Value; 2310 if (Isv2x16VT(VT)) { 2311 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16); 2312 } else if (VT == MVT::v4i8) { 2313 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) | 2314 GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24); 2315 } else { 2316 llvm_unreachable("Unsupported type"); 2317 } 2318 SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32); 2319 return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const); 2320 } 2321 2322 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 2323 SelectionDAG &DAG) const { 2324 SDValue Index = Op->getOperand(1); 2325 SDValue Vector = Op->getOperand(0); 2326 SDLoc DL(Op); 2327 EVT VectorVT = Vector.getValueType(); 2328 2329 if (VectorVT == MVT::v4i8) { 2330 SDValue BFE = 2331 DAG.getNode(NVPTXISD::BFE, DL, MVT::i32, 2332 {Vector, 2333 DAG.getNode(ISD::MUL, DL, MVT::i32, 2334 DAG.getZExtOrTrunc(Index, DL, MVT::i32), 2335 DAG.getConstant(8, DL, MVT::i32)), 2336 DAG.getConstant(8, DL, MVT::i32)}); 2337 return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0)); 2338 } 2339 2340 // Constant index will be matched by tablegen. 2341 if (isa<ConstantSDNode>(Index.getNode())) 2342 return Op; 2343 2344 // Extract individual elements and select one of them. 
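  // (E.g. extracting a dynamic index from <2 x half> becomes
  //  select(Index == 0, E0, E1) below.)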
2345 assert(Isv2x16VT(VectorVT) && "Unexpected vector type."); 2346 EVT EltVT = VectorVT.getVectorElementType(); 2347 2348 SDLoc dl(Op.getNode()); 2349 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 2350 DAG.getIntPtrConstant(0, dl)); 2351 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 2352 DAG.getIntPtrConstant(1, dl)); 2353 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1, 2354 ISD::CondCode::SETEQ); 2355 } 2356 2357 SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 2358 SelectionDAG &DAG) const { 2359 SDValue Vector = Op->getOperand(0); 2360 EVT VectorVT = Vector.getValueType(); 2361 2362 if (VectorVT != MVT::v4i8) 2363 return Op; 2364 SDLoc DL(Op); 2365 SDValue Value = Op->getOperand(1); 2366 if (Value->isUndef()) 2367 return Vector; 2368 2369 SDValue Index = Op->getOperand(2); 2370 2371 SDValue BFI = 2372 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, 2373 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector, 2374 DAG.getNode(ISD::MUL, DL, MVT::i32, 2375 DAG.getZExtOrTrunc(Index, DL, MVT::i32), 2376 DAG.getConstant(8, DL, MVT::i32)), 2377 DAG.getConstant(8, DL, MVT::i32)}); 2378 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI); 2379 } 2380 2381 SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, 2382 SelectionDAG &DAG) const { 2383 SDValue V1 = Op.getOperand(0); 2384 EVT VectorVT = V1.getValueType(); 2385 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8) 2386 return Op; 2387 2388 // Lower shuffle to PRMT instruction. 2389 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 2390 SDValue V2 = Op.getOperand(1); 2391 uint32_t Selector = 0; 2392 for (auto I : llvm::enumerate(SVN->getMask())) 2393 Selector |= (I.value() << (I.index() * 4)); 2394 2395 SDLoc DL(Op); 2396 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2, 2397 DAG.getConstant(Selector, DL, MVT::i32), 2398 DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)); 2399 } 2400 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which 2401 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2402 /// amount, or 2403 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2404 /// amount. 2405 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, 2406 SelectionDAG &DAG) const { 2407 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 2408 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 2409 2410 EVT VT = Op.getValueType(); 2411 unsigned VTBits = VT.getSizeInBits(); 2412 SDLoc dl(Op); 2413 SDValue ShOpLo = Op.getOperand(0); 2414 SDValue ShOpHi = Op.getOperand(1); 2415 SDValue ShAmt = Op.getOperand(2); 2416 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 2417 2418 if (VTBits == 32 && STI.getSmVersion() >= 35) { 2419 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
2420 // {dHi, dLo} = {aHi, aLo} >> Amt 2421 // dHi = aHi >> Amt 2422 // dLo = shf.r.clamp aLo, aHi, Amt 2423 2424 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 2425 SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi, 2426 ShAmt); 2427 2428 SDValue Ops[2] = { Lo, Hi }; 2429 return DAG.getMergeValues(Ops, dl); 2430 } 2431 else { 2432 // {dHi, dLo} = {aHi, aLo} >> Amt 2433 // - if (Amt>=size) then 2434 // dLo = aHi >> (Amt-size) 2435 // dHi = aHi >> Amt (this is either all 0 or all 1) 2436 // else 2437 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) 2438 // dHi = aHi >> Amt 2439 2440 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 2441 DAG.getConstant(VTBits, dl, MVT::i32), 2442 ShAmt); 2443 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 2444 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 2445 DAG.getConstant(VTBits, dl, MVT::i32)); 2446 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 2447 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 2448 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 2449 2450 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, 2451 DAG.getConstant(VTBits, dl, MVT::i32), 2452 ISD::SETGE); 2453 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 2454 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); 2455 2456 SDValue Ops[2] = { Lo, Hi }; 2457 return DAG.getMergeValues(Ops, dl); 2458 } 2459 } 2460 2461 /// LowerShiftLeftParts - Lower SHL_PARTS, which 2462 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2463 /// amount, or 2464 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2465 /// amount. 2466 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, 2467 SelectionDAG &DAG) const { 2468 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 2469 assert(Op.getOpcode() == ISD::SHL_PARTS); 2470 2471 EVT VT = Op.getValueType(); 2472 unsigned VTBits = VT.getSizeInBits(); 2473 SDLoc dl(Op); 2474 SDValue ShOpLo = Op.getOperand(0); 2475 SDValue ShOpHi = Op.getOperand(1); 2476 SDValue ShAmt = Op.getOperand(2); 2477 2478 if (VTBits == 32 && STI.getSmVersion() >= 35) { 2479 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
    // {dHi, dLo} = {aHi, aLo} << Amt
    // dHi = shf.l.clamp aLo, aHi, Amt
    // dLo = aLo << Amt

    SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
                             ShAmt);
    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
  else {
    // {dHi, dLo} = {aHi, aLo} << Amt
    // - if (Amt>=size) then
    //      dLo = aLo << Amt (all 0)
    //      dHi = aLo << (Amt-size)
    //   else
    //      dLo = aLo << Amt
    //      dHi = (aHi << Amt) | (aLo >> (size-Amt))

    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                   DAG.getConstant(VTBits, dl, MVT::i32),
                                   ShAmt);
    SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                     DAG.getConstant(VTBits, dl, MVT::i32));
    SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
    SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);

    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
                               DAG.getConstant(VTBits, dl, MVT::i32),
                               ISD::SETGE);
    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
    SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
}

SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFROUND32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFROUND64(Op, DAG);

  llvm_unreachable("unhandled type");
}

// This is the rounding method used in CUDA libdevice, in C-like code:
// float roundf(float A)
// {
//   float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
//   RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
//   return abs(A) < 0.5 ? (float)(int)A : RoundedA;
// }
SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue A = Op.getOperand(0);
  EVT VT = Op.getValueType();

  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);

  // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
  SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
  const int SignBitMask = 0x80000000;
  SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
                             DAG.getConstant(SignBitMask, SL, MVT::i32));
  const int PointFiveInBits = 0x3F000000;
  SDValue PointFiveWithSignRaw =
      DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
                  DAG.getConstant(PointFiveInBits, SL, MVT::i32));
  SDValue PointFiveWithSign =
      DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);

  // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue IsLarge =
      DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
                   ISD::SETOGT);
  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);

  // return abs(A) < 0.5 ?
(float)(int)A : RoundedA; 2570 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2571 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2572 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A); 2573 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA); 2574 } 2575 2576 // The implementation of round(double) is similar to that of round(float) in 2577 // that they both separate the value range into three regions and use a method 2578 // specific to the region to round the values. However, round(double) first 2579 // calculates the round of the absolute value and then adds the sign back while 2580 // round(float) directly rounds the value with sign. 2581 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op, 2582 SelectionDAG &DAG) const { 2583 SDLoc SL(Op); 2584 SDValue A = Op.getOperand(0); 2585 EVT VT = Op.getValueType(); 2586 2587 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A); 2588 2589 // double RoundedA = (double) (int) (abs(A) + 0.5f); 2590 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA, 2591 DAG.getConstantFP(0.5, SL, VT)); 2592 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA); 2593 2594 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA; 2595 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2596 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2597 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2598 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall, 2599 DAG.getConstantFP(0, SL, VT), 2600 RoundedA); 2601 2602 // Add sign to rounded_A 2603 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A); 2604 DAG.getNode(ISD::FTRUNC, SL, VT, A); 2605 2606 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA; 2607 SDValue IsLarge = 2608 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT), 2609 ISD::SETOGT); 2610 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); 2611 } 2612 2613 SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op, 2614 SelectionDAG &DAG) const { 2615 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78); 2616 2617 if (Op.getValueType() == MVT::bf16) { 2618 SDLoc Loc(Op); 2619 return DAG.getNode( 2620 ISD::FP_ROUND, Loc, MVT::bf16, 2621 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)), 2622 DAG.getIntPtrConstant(0, Loc)); 2623 } 2624 2625 // Everything else is considered legal. 2626 return Op; 2627 } 2628 2629 SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op, 2630 SelectionDAG &DAG) const { 2631 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78); 2632 2633 if (Op.getOperand(0).getValueType() == MVT::bf16) { 2634 SDLoc Loc(Op); 2635 return DAG.getNode( 2636 Op.getOpcode(), Loc, Op.getValueType(), 2637 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0))); 2638 } 2639 2640 // Everything else is considered legal. 
2641 return Op; 2642 } 2643 2644 static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) { 2645 SDLoc DL(Op); 2646 if (Op.getValueType() != MVT::v2i16) 2647 return Op; 2648 EVT EltVT = Op.getValueType().getVectorElementType(); 2649 SmallVector<SDValue> VecElements; 2650 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) { 2651 SmallVector<SDValue> ScalarArgs; 2652 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs), 2653 [&](const SDUse &O) { 2654 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, 2655 O.get(), DAG.getIntPtrConstant(I, DL)); 2656 }); 2657 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs)); 2658 } 2659 SDValue V = 2660 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements); 2661 return V; 2662 } 2663 2664 SDValue 2665 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 2666 switch (Op.getOpcode()) { 2667 case ISD::RETURNADDR: 2668 return SDValue(); 2669 case ISD::FRAMEADDR: 2670 return SDValue(); 2671 case ISD::GlobalAddress: 2672 return LowerGlobalAddress(Op, DAG); 2673 case ISD::INTRINSIC_W_CHAIN: 2674 return Op; 2675 case ISD::BUILD_VECTOR: 2676 return LowerBUILD_VECTOR(Op, DAG); 2677 case ISD::EXTRACT_SUBVECTOR: 2678 return Op; 2679 case ISD::EXTRACT_VECTOR_ELT: 2680 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 2681 case ISD::INSERT_VECTOR_ELT: 2682 return LowerINSERT_VECTOR_ELT(Op, DAG); 2683 case ISD::VECTOR_SHUFFLE: 2684 return LowerVECTOR_SHUFFLE(Op, DAG); 2685 case ISD::CONCAT_VECTORS: 2686 return LowerCONCAT_VECTORS(Op, DAG); 2687 case ISD::STORE: 2688 return LowerSTORE(Op, DAG); 2689 case ISD::LOAD: 2690 return LowerLOAD(Op, DAG); 2691 case ISD::SHL_PARTS: 2692 return LowerShiftLeftParts(Op, DAG); 2693 case ISD::SRA_PARTS: 2694 case ISD::SRL_PARTS: 2695 return LowerShiftRightParts(Op, DAG); 2696 case ISD::SELECT: 2697 return LowerSelect(Op, DAG); 2698 case ISD::FROUND: 2699 return LowerFROUND(Op, DAG); 2700 case ISD::SINT_TO_FP: 2701 case ISD::UINT_TO_FP: 2702 return LowerINT_TO_FP(Op, DAG); 2703 case ISD::FP_TO_SINT: 2704 case ISD::FP_TO_UINT: 2705 return LowerFP_TO_INT(Op, DAG); 2706 case ISD::VAARG: 2707 return LowerVAARG(Op, DAG); 2708 case ISD::VASTART: 2709 return LowerVASTART(Op, DAG); 2710 case ISD::ABS: 2711 case ISD::SMIN: 2712 case ISD::SMAX: 2713 case ISD::UMIN: 2714 case ISD::UMAX: 2715 case ISD::ADD: 2716 case ISD::SUB: 2717 case ISD::MUL: 2718 case ISD::SHL: 2719 case ISD::SREM: 2720 case ISD::UREM: 2721 return LowerVectorArith(Op, DAG); 2722 case ISD::DYNAMIC_STACKALLOC: 2723 return LowerDYNAMIC_STACKALLOC(Op, DAG); 2724 default: 2725 llvm_unreachable("Custom lowering not defined for operation"); 2726 } 2727 } 2728 2729 // This function is almost a copy of SelectionDAG::expandVAArg(). 2730 // The only diff is that this one produces loads from local address space. 
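// Rough shape of the expansion (a sketch of the steps implemented below):
//   1. Load the current va_list pointer.
//   2. If the argument needs more than the minimum stack argument alignment,
//      round the pointer up to that alignment.
//   3. Advance the pointer past the argument and store it back to the
//      va_list.
//   4. Load the argument itself from the local address space.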
2731 SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 2732 const TargetLowering *TLI = STI.getTargetLowering(); 2733 SDLoc DL(Op); 2734 2735 SDNode *Node = Op.getNode(); 2736 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue(); 2737 EVT VT = Node->getValueType(0); 2738 auto *Ty = VT.getTypeForEVT(*DAG.getContext()); 2739 SDValue Tmp1 = Node->getOperand(0); 2740 SDValue Tmp2 = Node->getOperand(1); 2741 const MaybeAlign MA(Node->getConstantOperandVal(3)); 2742 2743 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL, 2744 Tmp1, Tmp2, MachinePointerInfo(V)); 2745 SDValue VAList = VAListLoad; 2746 2747 if (MA && *MA > TLI->getMinStackArgumentAlignment()) { 2748 VAList = DAG.getNode( 2749 ISD::ADD, DL, VAList.getValueType(), VAList, 2750 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType())); 2751 2752 VAList = DAG.getNode( 2753 ISD::AND, DL, VAList.getValueType(), VAList, 2754 DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType())); 2755 } 2756 2757 // Increment the pointer, VAList, to the next vaarg 2758 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList, 2759 DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty), 2760 DL, VAList.getValueType())); 2761 2762 // Store the incremented VAList to the legalized pointer 2763 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2, 2764 MachinePointerInfo(V)); 2765 2766 const Value *SrcV = 2767 Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL)); 2768 2769 // Load the actual argument out of the pointer VAList 2770 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV)); 2771 } 2772 2773 SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 2774 const TargetLowering *TLI = STI.getTargetLowering(); 2775 SDLoc DL(Op); 2776 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout()); 2777 2778 // Store the address of unsized array <function>_vararg[] in the ap object. 2779 SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT); 2780 SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg); 2781 2782 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 2783 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1), 2784 MachinePointerInfo(SV)); 2785 } 2786 2787 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const { 2788 SDValue Op0 = Op->getOperand(0); 2789 SDValue Op1 = Op->getOperand(1); 2790 SDValue Op2 = Op->getOperand(2); 2791 SDLoc DL(Op.getNode()); 2792 2793 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1"); 2794 2795 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); 2796 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); 2797 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2); 2798 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select); 2799 2800 return Trunc; 2801 } 2802 2803 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 2804 if (Op.getValueType() == MVT::i1) 2805 return LowerLOADi1(Op, DAG); 2806 2807 // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle 2808 // unaligned loads and have to handle it here. 
2809   EVT VT = Op.getValueType();
2810   if (Isv2x16VT(VT) || VT == MVT::v4i8) {
2811     LoadSDNode *Load = cast<LoadSDNode>(Op);
2812     EVT MemVT = Load->getMemoryVT();
2813     if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2814                                         MemVT, *Load->getMemOperand())) {
2815       SDValue Ops[2];
2816       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2817       return DAG.getMergeValues(Ops, SDLoc(Op));
2818     }
2819   }
2820
2821   return SDValue();
2822 }
2823
2824 // v = ld i1* addr
2825 //   =>
2826 // v1 = ld i8* addr (-> i16)
2827 // v = trunc i16 to i1
2828 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2829   SDNode *Node = Op.getNode();
2830   LoadSDNode *LD = cast<LoadSDNode>(Node);
2831   SDLoc dl(Node);
2832   assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2833   assert(Node->getValueType(0) == MVT::i1 &&
2834          "Custom lowering for i1 load only");
2835   SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2836                               LD->getPointerInfo(), LD->getAlign(),
2837                               LD->getMemOperand()->getFlags());
2838   SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2839   // The legalizer (the caller) is expecting two values from the legalized
2840   // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2841   // in LegalizeDAG.cpp which also uses MergeValues.
2842   SDValue Ops[] = { result, LD->getChain() };
2843   return DAG.getMergeValues(Ops, dl);
2844 }
2845
2846 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2847   StoreSDNode *Store = cast<StoreSDNode>(Op);
2848   EVT VT = Store->getMemoryVT();
2849
2850   if (VT == MVT::i1)
2851     return LowerSTOREi1(Op, DAG);
2852
2853   // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle
2854   // unaligned stores and have to handle it here.
2855   if ((Isv2x16VT(VT) || VT == MVT::v4i8) &&
2856       !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2857                                       VT, *Store->getMemOperand()))
2858     return expandUnalignedStore(Store, DAG);
2859
2860   // v2f16, v2bf16, v2i16 and v4i8 don't need any further special handling.
2861   if (Isv2x16VT(VT) || VT == MVT::v4i8)
2862     return SDValue();
2863
2864   if (VT.isVector())
2865     return LowerSTOREVector(Op, DAG);
2866
2867   return SDValue();
2868 }
2869
2870 SDValue
2871 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2872   SDNode *N = Op.getNode();
2873   SDValue Val = N->getOperand(1);
2874   SDLoc DL(N);
2875   EVT ValVT = Val.getValueType();
2876
2877   if (ValVT.isVector()) {
2878     // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2879     // legal. We can (and should) split that into 2 stores of <2 x double> here
2880     // but I'm leaving that as a TODO for now.
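    // Overall approach: accept only the vector types PTX can store natively
    // and only when the store is sufficiently aligned, then extract the
    // elements (packing pairs of 16-bit elements into v2 chunks where needed)
    // and emit a single NVPTXISD::StoreV2/StoreV4 memory intrinsic node.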
2881 if (!ValVT.isSimple()) 2882 return SDValue(); 2883 switch (ValVT.getSimpleVT().SimpleTy) { 2884 default: 2885 return SDValue(); 2886 case MVT::v2i8: 2887 case MVT::v2i16: 2888 case MVT::v2i32: 2889 case MVT::v2i64: 2890 case MVT::v2f16: 2891 case MVT::v2bf16: 2892 case MVT::v2f32: 2893 case MVT::v2f64: 2894 case MVT::v4i8: 2895 case MVT::v4i16: 2896 case MVT::v4i32: 2897 case MVT::v4f16: 2898 case MVT::v4bf16: 2899 case MVT::v4f32: 2900 case MVT::v8f16: // <4 x f16x2> 2901 case MVT::v8bf16: // <4 x bf16x2> 2902 case MVT::v8i16: // <4 x i16x2> 2903 // This is a "native" vector type 2904 break; 2905 } 2906 2907 MemSDNode *MemSD = cast<MemSDNode>(N); 2908 const DataLayout &TD = DAG.getDataLayout(); 2909 2910 Align Alignment = MemSD->getAlign(); 2911 Align PrefAlign = 2912 TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext())); 2913 if (Alignment < PrefAlign) { 2914 // This store is not sufficiently aligned, so bail out and let this vector 2915 // store be scalarized. Note that we may still be able to emit smaller 2916 // vector stores. For example, if we are storing a <4 x float> with an 2917 // alignment of 8, this check will fail but the legalizer will try again 2918 // with 2 x <2 x float>, which will succeed with an alignment of 8. 2919 return SDValue(); 2920 } 2921 2922 unsigned Opcode = 0; 2923 EVT EltVT = ValVT.getVectorElementType(); 2924 unsigned NumElts = ValVT.getVectorNumElements(); 2925 2926 // Since StoreV2 is a target node, we cannot rely on DAG type legalization. 2927 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 2928 // stored type to i16 and propagate the "real" type as the memory type. 2929 bool NeedExt = false; 2930 if (EltVT.getSizeInBits() < 16) 2931 NeedExt = true; 2932 2933 bool StoreF16x2 = false; 2934 switch (NumElts) { 2935 default: 2936 return SDValue(); 2937 case 2: 2938 Opcode = NVPTXISD::StoreV2; 2939 break; 2940 case 4: 2941 Opcode = NVPTXISD::StoreV4; 2942 break; 2943 case 8: 2944 // v8f16 is a special case. PTX doesn't have st.v8.f16 2945 // instruction. Instead, we split the vector into v2f16 chunks and 2946 // store them with st.v4.b32. 
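      // For example, a <8 x f16> value is stored as four packed <2 x f16>
      // operands of a single StoreV4 (st.v4.b32); v8bf16 and v8i16 are
      // handled the same way.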
2947 assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector."); 2948 Opcode = NVPTXISD::StoreV4; 2949 StoreF16x2 = true; 2950 break; 2951 } 2952 2953 SmallVector<SDValue, 8> Ops; 2954 2955 // First is the chain 2956 Ops.push_back(N->getOperand(0)); 2957 2958 if (StoreF16x2) { 2959 // Combine f16,f16 -> v2f16 2960 NumElts /= 2; 2961 for (unsigned i = 0; i < NumElts; ++i) { 2962 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 2963 DAG.getIntPtrConstant(i * 2, DL)); 2964 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 2965 DAG.getIntPtrConstant(i * 2 + 1, DL)); 2966 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2); 2967 SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1); 2968 Ops.push_back(V2); 2969 } 2970 } else { 2971 // Then the split values 2972 for (unsigned i = 0; i < NumElts; ++i) { 2973 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 2974 DAG.getIntPtrConstant(i, DL)); 2975 if (NeedExt) 2976 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); 2977 Ops.push_back(ExtVal); 2978 } 2979 } 2980 2981 // Then any remaining arguments 2982 Ops.append(N->op_begin() + 2, N->op_end()); 2983 2984 SDValue NewSt = 2985 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops, 2986 MemSD->getMemoryVT(), MemSD->getMemOperand()); 2987 2988 // return DCI.CombineTo(N, NewSt, true); 2989 return NewSt; 2990 } 2991 2992 return SDValue(); 2993 } 2994 2995 // st i1 v, addr 2996 // => 2997 // v1 = zxt v to i16 2998 // st.u8 i16, addr 2999 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { 3000 SDNode *Node = Op.getNode(); 3001 SDLoc dl(Node); 3002 StoreSDNode *ST = cast<StoreSDNode>(Node); 3003 SDValue Tmp1 = ST->getChain(); 3004 SDValue Tmp2 = ST->getBasePtr(); 3005 SDValue Tmp3 = ST->getValue(); 3006 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only"); 3007 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); 3008 SDValue Result = 3009 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, 3010 ST->getAlign(), ST->getMemOperand()->getFlags()); 3011 return Result; 3012 } 3013 3014 // This creates target external symbol for a function parameter. 3015 // Name of the symbol is composed from its index and the function name. 3016 // Negative index corresponds to special parameter (unsized array) used for 3017 // passing variable arguments. 
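// For example (assuming the usual NVPTX naming scheme produced by
// getParamName), the second parameter of a function "foo" would be referred
// to as "foo_param_1", and the vararg array as "foo_vararg" (see LowerVASTART
// above).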
3018 SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, 3019 EVT v) const { 3020 StringRef SavedStr = nvTM->getStrPool().save( 3021 getParamName(&DAG.getMachineFunction().getFunction(), idx)); 3022 return DAG.getTargetExternalSymbol(SavedStr.data(), v); 3023 } 3024 3025 SDValue NVPTXTargetLowering::LowerFormalArguments( 3026 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3027 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3028 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3029 MachineFunction &MF = DAG.getMachineFunction(); 3030 const DataLayout &DL = DAG.getDataLayout(); 3031 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3032 3033 const Function *F = &MF.getFunction(); 3034 const AttributeList &PAL = F->getAttributes(); 3035 const TargetLowering *TLI = STI.getTargetLowering(); 3036 3037 SDValue Root = DAG.getRoot(); 3038 std::vector<SDValue> OutChains; 3039 3040 bool isABI = (STI.getSmVersion() >= 20); 3041 assert(isABI && "Non-ABI compilation is not supported"); 3042 if (!isABI) 3043 return Chain; 3044 3045 std::vector<Type *> argTypes; 3046 std::vector<const Argument *> theArgs; 3047 for (const Argument &I : F->args()) { 3048 theArgs.push_back(&I); 3049 argTypes.push_back(I.getType()); 3050 } 3051 // argTypes.size() (or theArgs.size()) and Ins.size() need not match. 3052 // Ins.size() will be larger 3053 // * if there is an aggregate argument with multiple fields (each field 3054 // showing up separately in Ins) 3055 // * if there is a vector argument with more than typical vector-length 3056 // elements (generally if more than 4) where each vector element is 3057 // individually present in Ins. 3058 // So a different index should be used for indexing into Ins. 3059 // See similar issue in LowerCall. 3060 unsigned InsIdx = 0; 3061 3062 int idx = 0; 3063 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) { 3064 Type *Ty = argTypes[i]; 3065 3066 if (theArgs[i]->use_empty()) { 3067 // argument is dead 3068 if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) { 3069 SmallVector<EVT, 16> vtparts; 3070 3071 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts); 3072 if (vtparts.empty()) 3073 report_fatal_error("Empty parameter types are not supported"); 3074 3075 for (unsigned parti = 0, parte = vtparts.size(); parti != parte; 3076 ++parti) { 3077 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 3078 ++InsIdx; 3079 } 3080 if (vtparts.size() > 0) 3081 --InsIdx; 3082 continue; 3083 } 3084 if (Ty->isVectorTy()) { 3085 EVT ObjectVT = getValueType(DL, Ty); 3086 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT); 3087 for (unsigned parti = 0; parti < NumRegs; ++parti) { 3088 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 3089 ++InsIdx; 3090 } 3091 if (NumRegs > 0) 3092 --InsIdx; 3093 continue; 3094 } 3095 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 3096 continue; 3097 } 3098 3099 // In the following cases, assign a node order of "idx+1" 3100 // to newly created nodes. The SDNodes for params have to 3101 // appear in the same order as their order of appearance 3102 // in the original function. "idx+1" holds that order. 
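    // Two cases follow: arguments without the byval attribute are loaded from
    // the .param address space, with adjacent pieces coalesced into vector
    // loads as VectorizePTXValueVTs permits; byval arguments are instead
    // handled at the bottom of the loop by wrapping the parameter symbol in a
    // MoveParam node.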
3103 if (!PAL.hasParamAttr(i, Attribute::ByVal)) { 3104 bool aggregateIsPacked = false; 3105 if (StructType *STy = dyn_cast<StructType>(Ty)) 3106 aggregateIsPacked = STy->isPacked(); 3107 3108 SmallVector<EVT, 16> VTs; 3109 SmallVector<uint64_t, 16> Offsets; 3110 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); 3111 if (VTs.empty()) 3112 report_fatal_error("Empty parameter types are not supported"); 3113 3114 auto VectorInfo = 3115 VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty)); 3116 3117 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 3118 int VecIdx = -1; // Index of the first element of the current vector. 3119 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { 3120 if (VectorInfo[parti] & PVF_FIRST) { 3121 assert(VecIdx == -1 && "Orphaned vector."); 3122 VecIdx = parti; 3123 } 3124 3125 // That's the last element of this store op. 3126 if (VectorInfo[parti] & PVF_LAST) { 3127 unsigned NumElts = parti - VecIdx + 1; 3128 EVT EltVT = VTs[parti]; 3129 // i1 is loaded/stored as i8. 3130 EVT LoadVT = EltVT; 3131 if (EltVT == MVT::i1) 3132 LoadVT = MVT::i8; 3133 else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8) 3134 // getLoad needs a vector type, but it can't handle 3135 // vectors which contain v2f16 or v2bf16 elements. So we must load 3136 // using i32 here and then bitcast back. 3137 LoadVT = MVT::i32; 3138 3139 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); 3140 SDValue VecAddr = 3141 DAG.getNode(ISD::ADD, dl, PtrVT, Arg, 3142 DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); 3143 Value *srcValue = Constant::getNullValue(PointerType::get( 3144 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); 3145 SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr, 3146 MachinePointerInfo(srcValue), 3147 MaybeAlign(aggregateIsPacked ? 1 : 0), 3148 MachineMemOperand::MODereferenceable | 3149 MachineMemOperand::MOInvariant); 3150 if (P.getNode()) 3151 P.getNode()->setIROrder(idx + 1); 3152 for (unsigned j = 0; j < NumElts; ++j) { 3153 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P, 3154 DAG.getIntPtrConstant(j, dl)); 3155 // We've loaded i1 as an i8 and now must truncate it back to i1 3156 if (EltVT == MVT::i1) 3157 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); 3158 // v2f16 was loaded as an i32. Now we must bitcast it back. 3159 else if (EltVT != LoadVT) 3160 Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt); 3161 3162 // If a promoted integer type is used, truncate down to the original 3163 MVT PromotedVT; 3164 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) { 3165 Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); 3166 } 3167 3168 // Extend the element if necessary (e.g. an i8 is loaded 3169 // into an i16 register) 3170 if (Ins[InsIdx].VT.isInteger() && 3171 Ins[InsIdx].VT.getFixedSizeInBits() > 3172 LoadVT.getFixedSizeInBits()) { 3173 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 3174 : ISD::ZERO_EXTEND; 3175 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt); 3176 } 3177 InVals.push_back(Elt); 3178 } 3179 3180 // Reset vector tracking state. 3181 VecIdx = -1; 3182 } 3183 ++InsIdx; 3184 } 3185 if (VTs.size() > 0) 3186 --InsIdx; 3187 continue; 3188 } 3189 3190 // Param has ByVal attribute 3191 // Return MoveParam(param symbol). 
3192 // Ideally, the param symbol can be returned directly, 3193 // but when SDNode builder decides to use it in a CopyToReg(), 3194 // machine instruction fails because TargetExternalSymbol 3195 // (not lowered) is target dependent, and CopyToReg assumes 3196 // the source is lowered. 3197 EVT ObjectVT = getValueType(DL, Ty); 3198 assert(ObjectVT == Ins[InsIdx].VT && 3199 "Ins type did not match function type"); 3200 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 3201 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); 3202 if (p.getNode()) 3203 p.getNode()->setIROrder(idx + 1); 3204 InVals.push_back(p); 3205 } 3206 3207 if (!OutChains.empty()) 3208 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains)); 3209 3210 return Chain; 3211 } 3212 3213 SDValue 3214 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 3215 bool isVarArg, 3216 const SmallVectorImpl<ISD::OutputArg> &Outs, 3217 const SmallVectorImpl<SDValue> &OutVals, 3218 const SDLoc &dl, SelectionDAG &DAG) const { 3219 const MachineFunction &MF = DAG.getMachineFunction(); 3220 const Function &F = MF.getFunction(); 3221 Type *RetTy = MF.getFunction().getReturnType(); 3222 3223 bool isABI = (STI.getSmVersion() >= 20); 3224 assert(isABI && "Non-ABI compilation is not supported"); 3225 if (!isABI) 3226 return Chain; 3227 3228 const DataLayout &DL = DAG.getDataLayout(); 3229 SmallVector<SDValue, 16> PromotedOutVals; 3230 SmallVector<EVT, 16> VTs; 3231 SmallVector<uint64_t, 16> Offsets; 3232 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); 3233 assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); 3234 3235 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 3236 SDValue PromotedOutVal = OutVals[i]; 3237 MVT PromotedVT; 3238 if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) { 3239 VTs[i] = EVT(PromotedVT); 3240 } 3241 if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) { 3242 llvm::ISD::NodeType Ext = 3243 Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 3244 PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal); 3245 } 3246 PromotedOutVals.push_back(PromotedOutVal); 3247 } 3248 3249 auto VectorInfo = VectorizePTXValueVTs( 3250 VTs, Offsets, 3251 RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL) 3252 : Align(1)); 3253 3254 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 3255 // 32-bits are sign extended or zero extended, depending on whether 3256 // they are signed or unsigned types. 3257 bool ExtendIntegerRetVal = 3258 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 3259 3260 SmallVector<SDValue, 6> StoreOperands; 3261 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 3262 // New load/store. Record chain and offset operands. 3263 if (VectorInfo[i] & PVF_FIRST) { 3264 assert(StoreOperands.empty() && "Orphaned operand list."); 3265 StoreOperands.push_back(Chain); 3266 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); 3267 } 3268 3269 SDValue OutVal = OutVals[i]; 3270 SDValue RetVal = PromotedOutVals[i]; 3271 3272 if (ExtendIntegerRetVal) { 3273 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND 3274 : ISD::ZERO_EXTEND, 3275 dl, MVT::i32, RetVal); 3276 } else if (OutVal.getValueSizeInBits() < 16) { 3277 // Use 16-bit registers for small load-stores as it's the 3278 // smallest general purpose register size supported by NVPTX. 3279 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); 3280 } 3281 3282 // Record the value to return. 
3283 StoreOperands.push_back(RetVal); 3284 3285 // That's the last element of this store op. 3286 if (VectorInfo[i] & PVF_LAST) { 3287 NVPTXISD::NodeType Op; 3288 unsigned NumElts = StoreOperands.size() - 2; 3289 switch (NumElts) { 3290 case 1: 3291 Op = NVPTXISD::StoreRetval; 3292 break; 3293 case 2: 3294 Op = NVPTXISD::StoreRetvalV2; 3295 break; 3296 case 4: 3297 Op = NVPTXISD::StoreRetvalV4; 3298 break; 3299 default: 3300 llvm_unreachable("Invalid vector info."); 3301 } 3302 3303 // Adjust type of load/store op if we've extended the scalar 3304 // return value. 3305 EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; 3306 Chain = DAG.getMemIntrinsicNode( 3307 Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType, 3308 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); 3309 // Cleanup vector state. 3310 StoreOperands.clear(); 3311 } 3312 } 3313 3314 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain); 3315 } 3316 3317 void NVPTXTargetLowering::LowerAsmOperandForConstraint( 3318 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops, 3319 SelectionDAG &DAG) const { 3320 if (Constraint.size() > 1) 3321 return; 3322 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 3323 } 3324 3325 static unsigned getOpcForTextureInstr(unsigned Intrinsic) { 3326 switch (Intrinsic) { 3327 default: 3328 return 0; 3329 3330 case Intrinsic::nvvm_tex_1d_v4f32_s32: 3331 return NVPTXISD::Tex1DFloatS32; 3332 case Intrinsic::nvvm_tex_1d_v4f32_f32: 3333 return NVPTXISD::Tex1DFloatFloat; 3334 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 3335 return NVPTXISD::Tex1DFloatFloatLevel; 3336 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 3337 return NVPTXISD::Tex1DFloatFloatGrad; 3338 case Intrinsic::nvvm_tex_1d_v4s32_s32: 3339 return NVPTXISD::Tex1DS32S32; 3340 case Intrinsic::nvvm_tex_1d_v4s32_f32: 3341 return NVPTXISD::Tex1DS32Float; 3342 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 3343 return NVPTXISD::Tex1DS32FloatLevel; 3344 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 3345 return NVPTXISD::Tex1DS32FloatGrad; 3346 case Intrinsic::nvvm_tex_1d_v4u32_s32: 3347 return NVPTXISD::Tex1DU32S32; 3348 case Intrinsic::nvvm_tex_1d_v4u32_f32: 3349 return NVPTXISD::Tex1DU32Float; 3350 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 3351 return NVPTXISD::Tex1DU32FloatLevel; 3352 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 3353 return NVPTXISD::Tex1DU32FloatGrad; 3354 3355 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 3356 return NVPTXISD::Tex1DArrayFloatS32; 3357 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 3358 return NVPTXISD::Tex1DArrayFloatFloat; 3359 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 3360 return NVPTXISD::Tex1DArrayFloatFloatLevel; 3361 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 3362 return NVPTXISD::Tex1DArrayFloatFloatGrad; 3363 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 3364 return NVPTXISD::Tex1DArrayS32S32; 3365 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 3366 return NVPTXISD::Tex1DArrayS32Float; 3367 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 3368 return NVPTXISD::Tex1DArrayS32FloatLevel; 3369 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 3370 return NVPTXISD::Tex1DArrayS32FloatGrad; 3371 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 3372 return NVPTXISD::Tex1DArrayU32S32; 3373 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 3374 return NVPTXISD::Tex1DArrayU32Float; 3375 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 3376 return NVPTXISD::Tex1DArrayU32FloatLevel; 3377 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 3378 return 
NVPTXISD::Tex1DArrayU32FloatGrad; 3379 3380 case Intrinsic::nvvm_tex_2d_v4f32_s32: 3381 return NVPTXISD::Tex2DFloatS32; 3382 case Intrinsic::nvvm_tex_2d_v4f32_f32: 3383 return NVPTXISD::Tex2DFloatFloat; 3384 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 3385 return NVPTXISD::Tex2DFloatFloatLevel; 3386 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 3387 return NVPTXISD::Tex2DFloatFloatGrad; 3388 case Intrinsic::nvvm_tex_2d_v4s32_s32: 3389 return NVPTXISD::Tex2DS32S32; 3390 case Intrinsic::nvvm_tex_2d_v4s32_f32: 3391 return NVPTXISD::Tex2DS32Float; 3392 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 3393 return NVPTXISD::Tex2DS32FloatLevel; 3394 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 3395 return NVPTXISD::Tex2DS32FloatGrad; 3396 case Intrinsic::nvvm_tex_2d_v4u32_s32: 3397 return NVPTXISD::Tex2DU32S32; 3398 case Intrinsic::nvvm_tex_2d_v4u32_f32: 3399 return NVPTXISD::Tex2DU32Float; 3400 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 3401 return NVPTXISD::Tex2DU32FloatLevel; 3402 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 3403 return NVPTXISD::Tex2DU32FloatGrad; 3404 3405 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 3406 return NVPTXISD::Tex2DArrayFloatS32; 3407 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 3408 return NVPTXISD::Tex2DArrayFloatFloat; 3409 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 3410 return NVPTXISD::Tex2DArrayFloatFloatLevel; 3411 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 3412 return NVPTXISD::Tex2DArrayFloatFloatGrad; 3413 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 3414 return NVPTXISD::Tex2DArrayS32S32; 3415 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 3416 return NVPTXISD::Tex2DArrayS32Float; 3417 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 3418 return NVPTXISD::Tex2DArrayS32FloatLevel; 3419 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 3420 return NVPTXISD::Tex2DArrayS32FloatGrad; 3421 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 3422 return NVPTXISD::Tex2DArrayU32S32; 3423 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 3424 return NVPTXISD::Tex2DArrayU32Float; 3425 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 3426 return NVPTXISD::Tex2DArrayU32FloatLevel; 3427 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 3428 return NVPTXISD::Tex2DArrayU32FloatGrad; 3429 3430 case Intrinsic::nvvm_tex_3d_v4f32_s32: 3431 return NVPTXISD::Tex3DFloatS32; 3432 case Intrinsic::nvvm_tex_3d_v4f32_f32: 3433 return NVPTXISD::Tex3DFloatFloat; 3434 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 3435 return NVPTXISD::Tex3DFloatFloatLevel; 3436 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 3437 return NVPTXISD::Tex3DFloatFloatGrad; 3438 case Intrinsic::nvvm_tex_3d_v4s32_s32: 3439 return NVPTXISD::Tex3DS32S32; 3440 case Intrinsic::nvvm_tex_3d_v4s32_f32: 3441 return NVPTXISD::Tex3DS32Float; 3442 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 3443 return NVPTXISD::Tex3DS32FloatLevel; 3444 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 3445 return NVPTXISD::Tex3DS32FloatGrad; 3446 case Intrinsic::nvvm_tex_3d_v4u32_s32: 3447 return NVPTXISD::Tex3DU32S32; 3448 case Intrinsic::nvvm_tex_3d_v4u32_f32: 3449 return NVPTXISD::Tex3DU32Float; 3450 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 3451 return NVPTXISD::Tex3DU32FloatLevel; 3452 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 3453 return NVPTXISD::Tex3DU32FloatGrad; 3454 3455 case Intrinsic::nvvm_tex_cube_v4f32_f32: 3456 return NVPTXISD::TexCubeFloatFloat; 3457 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 3458 return NVPTXISD::TexCubeFloatFloatLevel; 3459 case Intrinsic::nvvm_tex_cube_v4s32_f32: 3460 return 
NVPTXISD::TexCubeS32Float; 3461 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 3462 return NVPTXISD::TexCubeS32FloatLevel; 3463 case Intrinsic::nvvm_tex_cube_v4u32_f32: 3464 return NVPTXISD::TexCubeU32Float; 3465 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 3466 return NVPTXISD::TexCubeU32FloatLevel; 3467 3468 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 3469 return NVPTXISD::TexCubeArrayFloatFloat; 3470 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 3471 return NVPTXISD::TexCubeArrayFloatFloatLevel; 3472 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 3473 return NVPTXISD::TexCubeArrayS32Float; 3474 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 3475 return NVPTXISD::TexCubeArrayS32FloatLevel; 3476 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 3477 return NVPTXISD::TexCubeArrayU32Float; 3478 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 3479 return NVPTXISD::TexCubeArrayU32FloatLevel; 3480 3481 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 3482 return NVPTXISD::Tld4R2DFloatFloat; 3483 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 3484 return NVPTXISD::Tld4G2DFloatFloat; 3485 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 3486 return NVPTXISD::Tld4B2DFloatFloat; 3487 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 3488 return NVPTXISD::Tld4A2DFloatFloat; 3489 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 3490 return NVPTXISD::Tld4R2DS64Float; 3491 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 3492 return NVPTXISD::Tld4G2DS64Float; 3493 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 3494 return NVPTXISD::Tld4B2DS64Float; 3495 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 3496 return NVPTXISD::Tld4A2DS64Float; 3497 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 3498 return NVPTXISD::Tld4R2DU64Float; 3499 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 3500 return NVPTXISD::Tld4G2DU64Float; 3501 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 3502 return NVPTXISD::Tld4B2DU64Float; 3503 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 3504 return NVPTXISD::Tld4A2DU64Float; 3505 3506 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 3507 return NVPTXISD::TexUnified1DFloatS32; 3508 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 3509 return NVPTXISD::TexUnified1DFloatFloat; 3510 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 3511 return NVPTXISD::TexUnified1DFloatFloatLevel; 3512 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 3513 return NVPTXISD::TexUnified1DFloatFloatGrad; 3514 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 3515 return NVPTXISD::TexUnified1DS32S32; 3516 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 3517 return NVPTXISD::TexUnified1DS32Float; 3518 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 3519 return NVPTXISD::TexUnified1DS32FloatLevel; 3520 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 3521 return NVPTXISD::TexUnified1DS32FloatGrad; 3522 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 3523 return NVPTXISD::TexUnified1DU32S32; 3524 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 3525 return NVPTXISD::TexUnified1DU32Float; 3526 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 3527 return NVPTXISD::TexUnified1DU32FloatLevel; 3528 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 3529 return NVPTXISD::TexUnified1DU32FloatGrad; 3530 3531 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 3532 return NVPTXISD::TexUnified1DArrayFloatS32; 3533 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 3534 return NVPTXISD::TexUnified1DArrayFloatFloat; 3535 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 3536 return NVPTXISD::TexUnified1DArrayFloatFloatLevel; 3537 case 
Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 3538 return NVPTXISD::TexUnified1DArrayFloatFloatGrad; 3539 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 3540 return NVPTXISD::TexUnified1DArrayS32S32; 3541 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 3542 return NVPTXISD::TexUnified1DArrayS32Float; 3543 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 3544 return NVPTXISD::TexUnified1DArrayS32FloatLevel; 3545 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 3546 return NVPTXISD::TexUnified1DArrayS32FloatGrad; 3547 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 3548 return NVPTXISD::TexUnified1DArrayU32S32; 3549 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 3550 return NVPTXISD::TexUnified1DArrayU32Float; 3551 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 3552 return NVPTXISD::TexUnified1DArrayU32FloatLevel; 3553 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 3554 return NVPTXISD::TexUnified1DArrayU32FloatGrad; 3555 3556 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 3557 return NVPTXISD::TexUnified2DFloatS32; 3558 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 3559 return NVPTXISD::TexUnified2DFloatFloat; 3560 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 3561 return NVPTXISD::TexUnified2DFloatFloatLevel; 3562 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 3563 return NVPTXISD::TexUnified2DFloatFloatGrad; 3564 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 3565 return NVPTXISD::TexUnified2DS32S32; 3566 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 3567 return NVPTXISD::TexUnified2DS32Float; 3568 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 3569 return NVPTXISD::TexUnified2DS32FloatLevel; 3570 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 3571 return NVPTXISD::TexUnified2DS32FloatGrad; 3572 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 3573 return NVPTXISD::TexUnified2DU32S32; 3574 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 3575 return NVPTXISD::TexUnified2DU32Float; 3576 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 3577 return NVPTXISD::TexUnified2DU32FloatLevel; 3578 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 3579 return NVPTXISD::TexUnified2DU32FloatGrad; 3580 3581 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 3582 return NVPTXISD::TexUnified2DArrayFloatS32; 3583 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 3584 return NVPTXISD::TexUnified2DArrayFloatFloat; 3585 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 3586 return NVPTXISD::TexUnified2DArrayFloatFloatLevel; 3587 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 3588 return NVPTXISD::TexUnified2DArrayFloatFloatGrad; 3589 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 3590 return NVPTXISD::TexUnified2DArrayS32S32; 3591 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 3592 return NVPTXISD::TexUnified2DArrayS32Float; 3593 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 3594 return NVPTXISD::TexUnified2DArrayS32FloatLevel; 3595 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 3596 return NVPTXISD::TexUnified2DArrayS32FloatGrad; 3597 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 3598 return NVPTXISD::TexUnified2DArrayU32S32; 3599 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 3600 return NVPTXISD::TexUnified2DArrayU32Float; 3601 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 3602 return NVPTXISD::TexUnified2DArrayU32FloatLevel; 3603 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 3604 return 
NVPTXISD::TexUnified2DArrayU32FloatGrad; 3605 3606 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 3607 return NVPTXISD::TexUnified3DFloatS32; 3608 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 3609 return NVPTXISD::TexUnified3DFloatFloat; 3610 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 3611 return NVPTXISD::TexUnified3DFloatFloatLevel; 3612 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 3613 return NVPTXISD::TexUnified3DFloatFloatGrad; 3614 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 3615 return NVPTXISD::TexUnified3DS32S32; 3616 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 3617 return NVPTXISD::TexUnified3DS32Float; 3618 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 3619 return NVPTXISD::TexUnified3DS32FloatLevel; 3620 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 3621 return NVPTXISD::TexUnified3DS32FloatGrad; 3622 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 3623 return NVPTXISD::TexUnified3DU32S32; 3624 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 3625 return NVPTXISD::TexUnified3DU32Float; 3626 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 3627 return NVPTXISD::TexUnified3DU32FloatLevel; 3628 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 3629 return NVPTXISD::TexUnified3DU32FloatGrad; 3630 3631 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 3632 return NVPTXISD::TexUnifiedCubeFloatFloat; 3633 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 3634 return NVPTXISD::TexUnifiedCubeFloatFloatLevel; 3635 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 3636 return NVPTXISD::TexUnifiedCubeS32Float; 3637 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 3638 return NVPTXISD::TexUnifiedCubeS32FloatLevel; 3639 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 3640 return NVPTXISD::TexUnifiedCubeU32Float; 3641 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 3642 return NVPTXISD::TexUnifiedCubeU32FloatLevel; 3643 3644 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 3645 return NVPTXISD::TexUnifiedCubeArrayFloatFloat; 3646 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 3647 return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel; 3648 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 3649 return NVPTXISD::TexUnifiedCubeArrayS32Float; 3650 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 3651 return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel; 3652 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 3653 return NVPTXISD::TexUnifiedCubeArrayU32Float; 3654 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 3655 return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; 3656 3657 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 3658 return NVPTXISD::Tld4UnifiedR2DFloatFloat; 3659 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 3660 return NVPTXISD::Tld4UnifiedG2DFloatFloat; 3661 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 3662 return NVPTXISD::Tld4UnifiedB2DFloatFloat; 3663 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 3664 return NVPTXISD::Tld4UnifiedA2DFloatFloat; 3665 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 3666 return NVPTXISD::Tld4UnifiedR2DS64Float; 3667 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 3668 return NVPTXISD::Tld4UnifiedG2DS64Float; 3669 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 3670 return NVPTXISD::Tld4UnifiedB2DS64Float; 3671 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 3672 return NVPTXISD::Tld4UnifiedA2DS64Float; 3673 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 3674 return NVPTXISD::Tld4UnifiedR2DU64Float; 3675 case 
Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 3676 return NVPTXISD::Tld4UnifiedG2DU64Float; 3677 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 3678 return NVPTXISD::Tld4UnifiedB2DU64Float; 3679 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 3680 return NVPTXISD::Tld4UnifiedA2DU64Float; 3681 } 3682 } 3683 3684 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { 3685 switch (Intrinsic) { 3686 default: 3687 return 0; 3688 case Intrinsic::nvvm_suld_1d_i8_clamp: 3689 return NVPTXISD::Suld1DI8Clamp; 3690 case Intrinsic::nvvm_suld_1d_i16_clamp: 3691 return NVPTXISD::Suld1DI16Clamp; 3692 case Intrinsic::nvvm_suld_1d_i32_clamp: 3693 return NVPTXISD::Suld1DI32Clamp; 3694 case Intrinsic::nvvm_suld_1d_i64_clamp: 3695 return NVPTXISD::Suld1DI64Clamp; 3696 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 3697 return NVPTXISD::Suld1DV2I8Clamp; 3698 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 3699 return NVPTXISD::Suld1DV2I16Clamp; 3700 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 3701 return NVPTXISD::Suld1DV2I32Clamp; 3702 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 3703 return NVPTXISD::Suld1DV2I64Clamp; 3704 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 3705 return NVPTXISD::Suld1DV4I8Clamp; 3706 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 3707 return NVPTXISD::Suld1DV4I16Clamp; 3708 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 3709 return NVPTXISD::Suld1DV4I32Clamp; 3710 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 3711 return NVPTXISD::Suld1DArrayI8Clamp; 3712 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 3713 return NVPTXISD::Suld1DArrayI16Clamp; 3714 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 3715 return NVPTXISD::Suld1DArrayI32Clamp; 3716 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 3717 return NVPTXISD::Suld1DArrayI64Clamp; 3718 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 3719 return NVPTXISD::Suld1DArrayV2I8Clamp; 3720 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 3721 return NVPTXISD::Suld1DArrayV2I16Clamp; 3722 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 3723 return NVPTXISD::Suld1DArrayV2I32Clamp; 3724 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 3725 return NVPTXISD::Suld1DArrayV2I64Clamp; 3726 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 3727 return NVPTXISD::Suld1DArrayV4I8Clamp; 3728 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 3729 return NVPTXISD::Suld1DArrayV4I16Clamp; 3730 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 3731 return NVPTXISD::Suld1DArrayV4I32Clamp; 3732 case Intrinsic::nvvm_suld_2d_i8_clamp: 3733 return NVPTXISD::Suld2DI8Clamp; 3734 case Intrinsic::nvvm_suld_2d_i16_clamp: 3735 return NVPTXISD::Suld2DI16Clamp; 3736 case Intrinsic::nvvm_suld_2d_i32_clamp: 3737 return NVPTXISD::Suld2DI32Clamp; 3738 case Intrinsic::nvvm_suld_2d_i64_clamp: 3739 return NVPTXISD::Suld2DI64Clamp; 3740 case Intrinsic::nvvm_suld_2d_v2i8_clamp: 3741 return NVPTXISD::Suld2DV2I8Clamp; 3742 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 3743 return NVPTXISD::Suld2DV2I16Clamp; 3744 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 3745 return NVPTXISD::Suld2DV2I32Clamp; 3746 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 3747 return NVPTXISD::Suld2DV2I64Clamp; 3748 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 3749 return NVPTXISD::Suld2DV4I8Clamp; 3750 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 3751 return NVPTXISD::Suld2DV4I16Clamp; 3752 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 3753 return NVPTXISD::Suld2DV4I32Clamp; 3754 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 3755 return NVPTXISD::Suld2DArrayI8Clamp; 3756 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 3757 return NVPTXISD::Suld2DArrayI16Clamp; 3758 case 
Intrinsic::nvvm_suld_2d_array_i32_clamp: 3759 return NVPTXISD::Suld2DArrayI32Clamp; 3760 case Intrinsic::nvvm_suld_2d_array_i64_clamp: 3761 return NVPTXISD::Suld2DArrayI64Clamp; 3762 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 3763 return NVPTXISD::Suld2DArrayV2I8Clamp; 3764 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 3765 return NVPTXISD::Suld2DArrayV2I16Clamp; 3766 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 3767 return NVPTXISD::Suld2DArrayV2I32Clamp; 3768 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 3769 return NVPTXISD::Suld2DArrayV2I64Clamp; 3770 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 3771 return NVPTXISD::Suld2DArrayV4I8Clamp; 3772 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 3773 return NVPTXISD::Suld2DArrayV4I16Clamp; 3774 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 3775 return NVPTXISD::Suld2DArrayV4I32Clamp; 3776 case Intrinsic::nvvm_suld_3d_i8_clamp: 3777 return NVPTXISD::Suld3DI8Clamp; 3778 case Intrinsic::nvvm_suld_3d_i16_clamp: 3779 return NVPTXISD::Suld3DI16Clamp; 3780 case Intrinsic::nvvm_suld_3d_i32_clamp: 3781 return NVPTXISD::Suld3DI32Clamp; 3782 case Intrinsic::nvvm_suld_3d_i64_clamp: 3783 return NVPTXISD::Suld3DI64Clamp; 3784 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 3785 return NVPTXISD::Suld3DV2I8Clamp; 3786 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 3787 return NVPTXISD::Suld3DV2I16Clamp; 3788 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 3789 return NVPTXISD::Suld3DV2I32Clamp; 3790 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 3791 return NVPTXISD::Suld3DV2I64Clamp; 3792 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 3793 return NVPTXISD::Suld3DV4I8Clamp; 3794 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 3795 return NVPTXISD::Suld3DV4I16Clamp; 3796 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 3797 return NVPTXISD::Suld3DV4I32Clamp; 3798 case Intrinsic::nvvm_suld_1d_i8_trap: 3799 return NVPTXISD::Suld1DI8Trap; 3800 case Intrinsic::nvvm_suld_1d_i16_trap: 3801 return NVPTXISD::Suld1DI16Trap; 3802 case Intrinsic::nvvm_suld_1d_i32_trap: 3803 return NVPTXISD::Suld1DI32Trap; 3804 case Intrinsic::nvvm_suld_1d_i64_trap: 3805 return NVPTXISD::Suld1DI64Trap; 3806 case Intrinsic::nvvm_suld_1d_v2i8_trap: 3807 return NVPTXISD::Suld1DV2I8Trap; 3808 case Intrinsic::nvvm_suld_1d_v2i16_trap: 3809 return NVPTXISD::Suld1DV2I16Trap; 3810 case Intrinsic::nvvm_suld_1d_v2i32_trap: 3811 return NVPTXISD::Suld1DV2I32Trap; 3812 case Intrinsic::nvvm_suld_1d_v2i64_trap: 3813 return NVPTXISD::Suld1DV2I64Trap; 3814 case Intrinsic::nvvm_suld_1d_v4i8_trap: 3815 return NVPTXISD::Suld1DV4I8Trap; 3816 case Intrinsic::nvvm_suld_1d_v4i16_trap: 3817 return NVPTXISD::Suld1DV4I16Trap; 3818 case Intrinsic::nvvm_suld_1d_v4i32_trap: 3819 return NVPTXISD::Suld1DV4I32Trap; 3820 case Intrinsic::nvvm_suld_1d_array_i8_trap: 3821 return NVPTXISD::Suld1DArrayI8Trap; 3822 case Intrinsic::nvvm_suld_1d_array_i16_trap: 3823 return NVPTXISD::Suld1DArrayI16Trap; 3824 case Intrinsic::nvvm_suld_1d_array_i32_trap: 3825 return NVPTXISD::Suld1DArrayI32Trap; 3826 case Intrinsic::nvvm_suld_1d_array_i64_trap: 3827 return NVPTXISD::Suld1DArrayI64Trap; 3828 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 3829 return NVPTXISD::Suld1DArrayV2I8Trap; 3830 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 3831 return NVPTXISD::Suld1DArrayV2I16Trap; 3832 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 3833 return NVPTXISD::Suld1DArrayV2I32Trap; 3834 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 3835 return NVPTXISD::Suld1DArrayV2I64Trap; 3836 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 3837 return NVPTXISD::Suld1DArrayV4I8Trap; 3838 
case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 3839 return NVPTXISD::Suld1DArrayV4I16Trap; 3840 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 3841 return NVPTXISD::Suld1DArrayV4I32Trap; 3842 case Intrinsic::nvvm_suld_2d_i8_trap: 3843 return NVPTXISD::Suld2DI8Trap; 3844 case Intrinsic::nvvm_suld_2d_i16_trap: 3845 return NVPTXISD::Suld2DI16Trap; 3846 case Intrinsic::nvvm_suld_2d_i32_trap: 3847 return NVPTXISD::Suld2DI32Trap; 3848 case Intrinsic::nvvm_suld_2d_i64_trap: 3849 return NVPTXISD::Suld2DI64Trap; 3850 case Intrinsic::nvvm_suld_2d_v2i8_trap: 3851 return NVPTXISD::Suld2DV2I8Trap; 3852 case Intrinsic::nvvm_suld_2d_v2i16_trap: 3853 return NVPTXISD::Suld2DV2I16Trap; 3854 case Intrinsic::nvvm_suld_2d_v2i32_trap: 3855 return NVPTXISD::Suld2DV2I32Trap; 3856 case Intrinsic::nvvm_suld_2d_v2i64_trap: 3857 return NVPTXISD::Suld2DV2I64Trap; 3858 case Intrinsic::nvvm_suld_2d_v4i8_trap: 3859 return NVPTXISD::Suld2DV4I8Trap; 3860 case Intrinsic::nvvm_suld_2d_v4i16_trap: 3861 return NVPTXISD::Suld2DV4I16Trap; 3862 case Intrinsic::nvvm_suld_2d_v4i32_trap: 3863 return NVPTXISD::Suld2DV4I32Trap; 3864 case Intrinsic::nvvm_suld_2d_array_i8_trap: 3865 return NVPTXISD::Suld2DArrayI8Trap; 3866 case Intrinsic::nvvm_suld_2d_array_i16_trap: 3867 return NVPTXISD::Suld2DArrayI16Trap; 3868 case Intrinsic::nvvm_suld_2d_array_i32_trap: 3869 return NVPTXISD::Suld2DArrayI32Trap; 3870 case Intrinsic::nvvm_suld_2d_array_i64_trap: 3871 return NVPTXISD::Suld2DArrayI64Trap; 3872 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 3873 return NVPTXISD::Suld2DArrayV2I8Trap; 3874 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 3875 return NVPTXISD::Suld2DArrayV2I16Trap; 3876 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 3877 return NVPTXISD::Suld2DArrayV2I32Trap; 3878 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 3879 return NVPTXISD::Suld2DArrayV2I64Trap; 3880 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 3881 return NVPTXISD::Suld2DArrayV4I8Trap; 3882 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 3883 return NVPTXISD::Suld2DArrayV4I16Trap; 3884 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 3885 return NVPTXISD::Suld2DArrayV4I32Trap; 3886 case Intrinsic::nvvm_suld_3d_i8_trap: 3887 return NVPTXISD::Suld3DI8Trap; 3888 case Intrinsic::nvvm_suld_3d_i16_trap: 3889 return NVPTXISD::Suld3DI16Trap; 3890 case Intrinsic::nvvm_suld_3d_i32_trap: 3891 return NVPTXISD::Suld3DI32Trap; 3892 case Intrinsic::nvvm_suld_3d_i64_trap: 3893 return NVPTXISD::Suld3DI64Trap; 3894 case Intrinsic::nvvm_suld_3d_v2i8_trap: 3895 return NVPTXISD::Suld3DV2I8Trap; 3896 case Intrinsic::nvvm_suld_3d_v2i16_trap: 3897 return NVPTXISD::Suld3DV2I16Trap; 3898 case Intrinsic::nvvm_suld_3d_v2i32_trap: 3899 return NVPTXISD::Suld3DV2I32Trap; 3900 case Intrinsic::nvvm_suld_3d_v2i64_trap: 3901 return NVPTXISD::Suld3DV2I64Trap; 3902 case Intrinsic::nvvm_suld_3d_v4i8_trap: 3903 return NVPTXISD::Suld3DV4I8Trap; 3904 case Intrinsic::nvvm_suld_3d_v4i16_trap: 3905 return NVPTXISD::Suld3DV4I16Trap; 3906 case Intrinsic::nvvm_suld_3d_v4i32_trap: 3907 return NVPTXISD::Suld3DV4I32Trap; 3908 case Intrinsic::nvvm_suld_1d_i8_zero: 3909 return NVPTXISD::Suld1DI8Zero; 3910 case Intrinsic::nvvm_suld_1d_i16_zero: 3911 return NVPTXISD::Suld1DI16Zero; 3912 case Intrinsic::nvvm_suld_1d_i32_zero: 3913 return NVPTXISD::Suld1DI32Zero; 3914 case Intrinsic::nvvm_suld_1d_i64_zero: 3915 return NVPTXISD::Suld1DI64Zero; 3916 case Intrinsic::nvvm_suld_1d_v2i8_zero: 3917 return NVPTXISD::Suld1DV2I8Zero; 3918 case Intrinsic::nvvm_suld_1d_v2i16_zero: 3919 return NVPTXISD::Suld1DV2I16Zero; 3920 case 
Intrinsic::nvvm_suld_1d_v2i32_zero: 3921 return NVPTXISD::Suld1DV2I32Zero; 3922 case Intrinsic::nvvm_suld_1d_v2i64_zero: 3923 return NVPTXISD::Suld1DV2I64Zero; 3924 case Intrinsic::nvvm_suld_1d_v4i8_zero: 3925 return NVPTXISD::Suld1DV4I8Zero; 3926 case Intrinsic::nvvm_suld_1d_v4i16_zero: 3927 return NVPTXISD::Suld1DV4I16Zero; 3928 case Intrinsic::nvvm_suld_1d_v4i32_zero: 3929 return NVPTXISD::Suld1DV4I32Zero; 3930 case Intrinsic::nvvm_suld_1d_array_i8_zero: 3931 return NVPTXISD::Suld1DArrayI8Zero; 3932 case Intrinsic::nvvm_suld_1d_array_i16_zero: 3933 return NVPTXISD::Suld1DArrayI16Zero; 3934 case Intrinsic::nvvm_suld_1d_array_i32_zero: 3935 return NVPTXISD::Suld1DArrayI32Zero; 3936 case Intrinsic::nvvm_suld_1d_array_i64_zero: 3937 return NVPTXISD::Suld1DArrayI64Zero; 3938 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 3939 return NVPTXISD::Suld1DArrayV2I8Zero; 3940 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 3941 return NVPTXISD::Suld1DArrayV2I16Zero; 3942 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 3943 return NVPTXISD::Suld1DArrayV2I32Zero; 3944 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 3945 return NVPTXISD::Suld1DArrayV2I64Zero; 3946 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 3947 return NVPTXISD::Suld1DArrayV4I8Zero; 3948 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 3949 return NVPTXISD::Suld1DArrayV4I16Zero; 3950 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 3951 return NVPTXISD::Suld1DArrayV4I32Zero; 3952 case Intrinsic::nvvm_suld_2d_i8_zero: 3953 return NVPTXISD::Suld2DI8Zero; 3954 case Intrinsic::nvvm_suld_2d_i16_zero: 3955 return NVPTXISD::Suld2DI16Zero; 3956 case Intrinsic::nvvm_suld_2d_i32_zero: 3957 return NVPTXISD::Suld2DI32Zero; 3958 case Intrinsic::nvvm_suld_2d_i64_zero: 3959 return NVPTXISD::Suld2DI64Zero; 3960 case Intrinsic::nvvm_suld_2d_v2i8_zero: 3961 return NVPTXISD::Suld2DV2I8Zero; 3962 case Intrinsic::nvvm_suld_2d_v2i16_zero: 3963 return NVPTXISD::Suld2DV2I16Zero; 3964 case Intrinsic::nvvm_suld_2d_v2i32_zero: 3965 return NVPTXISD::Suld2DV2I32Zero; 3966 case Intrinsic::nvvm_suld_2d_v2i64_zero: 3967 return NVPTXISD::Suld2DV2I64Zero; 3968 case Intrinsic::nvvm_suld_2d_v4i8_zero: 3969 return NVPTXISD::Suld2DV4I8Zero; 3970 case Intrinsic::nvvm_suld_2d_v4i16_zero: 3971 return NVPTXISD::Suld2DV4I16Zero; 3972 case Intrinsic::nvvm_suld_2d_v4i32_zero: 3973 return NVPTXISD::Suld2DV4I32Zero; 3974 case Intrinsic::nvvm_suld_2d_array_i8_zero: 3975 return NVPTXISD::Suld2DArrayI8Zero; 3976 case Intrinsic::nvvm_suld_2d_array_i16_zero: 3977 return NVPTXISD::Suld2DArrayI16Zero; 3978 case Intrinsic::nvvm_suld_2d_array_i32_zero: 3979 return NVPTXISD::Suld2DArrayI32Zero; 3980 case Intrinsic::nvvm_suld_2d_array_i64_zero: 3981 return NVPTXISD::Suld2DArrayI64Zero; 3982 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 3983 return NVPTXISD::Suld2DArrayV2I8Zero; 3984 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 3985 return NVPTXISD::Suld2DArrayV2I16Zero; 3986 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 3987 return NVPTXISD::Suld2DArrayV2I32Zero; 3988 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 3989 return NVPTXISD::Suld2DArrayV2I64Zero; 3990 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 3991 return NVPTXISD::Suld2DArrayV4I8Zero; 3992 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 3993 return NVPTXISD::Suld2DArrayV4I16Zero; 3994 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 3995 return NVPTXISD::Suld2DArrayV4I32Zero; 3996 case Intrinsic::nvvm_suld_3d_i8_zero: 3997 return NVPTXISD::Suld3DI8Zero; 3998 case Intrinsic::nvvm_suld_3d_i16_zero: 3999 return NVPTXISD::Suld3DI16Zero; 
4000 case Intrinsic::nvvm_suld_3d_i32_zero: 4001 return NVPTXISD::Suld3DI32Zero; 4002 case Intrinsic::nvvm_suld_3d_i64_zero: 4003 return NVPTXISD::Suld3DI64Zero; 4004 case Intrinsic::nvvm_suld_3d_v2i8_zero: 4005 return NVPTXISD::Suld3DV2I8Zero; 4006 case Intrinsic::nvvm_suld_3d_v2i16_zero: 4007 return NVPTXISD::Suld3DV2I16Zero; 4008 case Intrinsic::nvvm_suld_3d_v2i32_zero: 4009 return NVPTXISD::Suld3DV2I32Zero; 4010 case Intrinsic::nvvm_suld_3d_v2i64_zero: 4011 return NVPTXISD::Suld3DV2I64Zero; 4012 case Intrinsic::nvvm_suld_3d_v4i8_zero: 4013 return NVPTXISD::Suld3DV4I8Zero; 4014 case Intrinsic::nvvm_suld_3d_v4i16_zero: 4015 return NVPTXISD::Suld3DV4I16Zero; 4016 case Intrinsic::nvvm_suld_3d_v4i32_zero: 4017 return NVPTXISD::Suld3DV4I32Zero; 4018 } 4019 } 4020 4021 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as 4022 // TgtMemIntrinsic 4023 // because we need the information that is only available in the "Value" type 4024 // of destination 4025 // pointer. In particular, the address space information. 4026 bool NVPTXTargetLowering::getTgtMemIntrinsic( 4027 IntrinsicInfo &Info, const CallInst &I, 4028 MachineFunction &MF, unsigned Intrinsic) const { 4029 switch (Intrinsic) { 4030 default: 4031 return false; 4032 case Intrinsic::nvvm_match_all_sync_i32p: 4033 case Intrinsic::nvvm_match_all_sync_i64p: 4034 Info.opc = ISD::INTRINSIC_W_CHAIN; 4035 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute 4036 // in order to model data exchange with other threads, but perform no real 4037 // memory accesses. 4038 Info.memVT = MVT::i1; 4039 4040 // Our result depends on both our and other thread's arguments. 4041 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 4042 return true; 4043 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col: 4044 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row: 4045 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride: 4046 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride: 4047 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col: 4048 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row: 4049 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride: 4050 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride: 4051 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col: 4052 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row: 4053 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride: 4054 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride: 4055 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col: 4056 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row: 4057 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride: 4058 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride: 4059 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col: 4060 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row: 4061 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride: 4062 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride: 4063 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col: 4064 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row: 4065 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride: 4066 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: { 4067 Info.opc = ISD::INTRINSIC_W_CHAIN; 4068 Info.memVT = MVT::v8f16; 4069 Info.ptrVal = I.getArgOperand(0); 4070 Info.offset = 0; 4071 Info.flags = MachineMemOperand::MOLoad; 4072 Info.align = Align(16); 4073 return true; 4074 } 4075 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col: 4076 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride: 
4077 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride: 4078 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col: 4079 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row: 4080 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride: 4081 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride: 4082 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row: 4083 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col: 4084 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride: 4085 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row: 4086 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride: 4087 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col: 4088 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride: 4089 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride: 4090 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col: 4091 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row: 4092 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride: 4093 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride: 4094 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: 4095 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col: 4096 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride: 4097 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row: 4098 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: { 4099 Info.opc = ISD::INTRINSIC_W_CHAIN; 4100 Info.memVT = MVT::v2i32; 4101 Info.ptrVal = I.getArgOperand(0); 4102 Info.offset = 0; 4103 Info.flags = MachineMemOperand::MOLoad; 4104 Info.align = Align(8); 4105 return true; 4106 } 4107 4108 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col: 4109 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride: 4110 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride: 4111 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col: 4112 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row: 4113 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride: 4114 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride: 4115 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row: 4116 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col: 4117 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride: 4118 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row: 4119 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride: 4120 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col: 4121 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride: 4122 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row: 4123 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride: 4124 4125 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col: 4126 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride: 4127 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride: 4128 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col: 4129 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row: 4130 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride: 4131 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride: 4132 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: 4133 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col: 4134 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride: 4135 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row: 4136 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride: 4137 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col: 4138 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride: 4139 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row: 4140 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride: 4141 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16: 4142 case 
Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: { 4143 Info.opc = ISD::INTRINSIC_W_CHAIN; 4144 Info.memVT = MVT::v4i32; 4145 Info.ptrVal = I.getArgOperand(0); 4146 Info.offset = 0; 4147 Info.flags = MachineMemOperand::MOLoad; 4148 Info.align = Align(16); 4149 return true; 4150 } 4151 4152 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col: 4153 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride: 4154 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride: 4155 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col: 4156 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row: 4157 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride: 4158 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride: 4159 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row: 4160 4161 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col: 4162 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride: 4163 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride: 4164 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col: 4165 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row: 4166 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride: 4167 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride: 4168 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row: 4169 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row: 4170 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride: 4171 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col: 4172 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride: 4173 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row: 4174 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride: 4175 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride: 4176 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row: 4177 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col: 4178 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride: 4179 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride: 4180 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: 4181 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16: 4182 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: { 4183 Info.opc = ISD::INTRINSIC_W_CHAIN; 4184 Info.memVT = MVT::i32; 4185 Info.ptrVal = I.getArgOperand(0); 4186 Info.offset = 0; 4187 Info.flags = MachineMemOperand::MOLoad; 4188 Info.align = Align(4); 4189 return true; 4190 } 4191 4192 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col: 4193 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row: 4194 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride: 4195 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride: 4196 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col: 4197 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row: 4198 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride: 4199 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride: 4200 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col: 4201 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row: 4202 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride: 4203 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: { 4204 Info.opc = ISD::INTRINSIC_W_CHAIN; 4205 Info.memVT = MVT::v4f16; 4206 Info.ptrVal = I.getArgOperand(0); 4207 Info.offset = 0; 4208 Info.flags = MachineMemOperand::MOLoad; 4209 Info.align = Align(16); 4210 return true; 4211 } 4212 4213 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col: 4214 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row: 4215 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride: 4216 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride: 4217 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col: 4218 case 
Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row: 4219 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride: 4220 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride: 4221 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col: 4222 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row: 4223 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride: 4224 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: 4225 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col: 4226 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row: 4227 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride: 4228 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: { 4229 Info.opc = ISD::INTRINSIC_W_CHAIN; 4230 Info.memVT = MVT::v8f32; 4231 Info.ptrVal = I.getArgOperand(0); 4232 Info.offset = 0; 4233 Info.flags = MachineMemOperand::MOLoad; 4234 Info.align = Align(16); 4235 return true; 4236 } 4237 4238 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col: 4239 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride: 4240 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row: 4241 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride: 4242 4243 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col: 4244 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride: 4245 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row: 4246 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride: 4247 4248 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col: 4249 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride: 4250 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row: 4251 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride: 4252 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col: 4253 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride: 4254 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row: 4255 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride: 4256 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col: 4257 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride: 4258 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row: 4259 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: { 4260 Info.opc = ISD::INTRINSIC_W_CHAIN; 4261 Info.memVT = MVT::v8i32; 4262 Info.ptrVal = I.getArgOperand(0); 4263 Info.offset = 0; 4264 Info.flags = MachineMemOperand::MOLoad; 4265 Info.align = Align(16); 4266 return true; 4267 } 4268 4269 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col: 4270 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride: 4271 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row: 4272 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride: 4273 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col: 4274 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride: 4275 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row: 4276 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: 4277 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16: 4278 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: { 4279 Info.opc = ISD::INTRINSIC_W_CHAIN; 4280 Info.memVT = MVT::v2i32; 4281 Info.ptrVal = I.getArgOperand(0); 4282 Info.offset = 0; 4283 Info.flags = MachineMemOperand::MOLoad; 4284 Info.align = Align(8); 4285 return true; 4286 } 4287 4288 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col: 4289 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride: 4290 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row: 4291 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride: 4292 4293 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col: 4294 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride: 4295 case 
Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row: 4296 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: { 4297 Info.opc = ISD::INTRINSIC_W_CHAIN; 4298 Info.memVT = MVT::f64; 4299 Info.ptrVal = I.getArgOperand(0); 4300 Info.offset = 0; 4301 Info.flags = MachineMemOperand::MOLoad; 4302 Info.align = Align(8); 4303 return true; 4304 } 4305 4306 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col: 4307 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride: 4308 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row: 4309 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: { 4310 Info.opc = ISD::INTRINSIC_W_CHAIN; 4311 Info.memVT = MVT::v2f64; 4312 Info.ptrVal = I.getArgOperand(0); 4313 Info.offset = 0; 4314 Info.flags = MachineMemOperand::MOLoad; 4315 Info.align = Align(16); 4316 return true; 4317 } 4318 4319 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col: 4320 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row: 4321 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride: 4322 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride: 4323 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col: 4324 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row: 4325 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride: 4326 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride: 4327 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col: 4328 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row: 4329 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride: 4330 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: { 4331 Info.opc = ISD::INTRINSIC_VOID; 4332 Info.memVT = MVT::v4f16; 4333 Info.ptrVal = I.getArgOperand(0); 4334 Info.offset = 0; 4335 Info.flags = MachineMemOperand::MOStore; 4336 Info.align = Align(16); 4337 return true; 4338 } 4339 4340 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col: 4341 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row: 4342 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride: 4343 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride: 4344 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col: 4345 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row: 4346 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride: 4347 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride: 4348 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col: 4349 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row: 4350 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride: 4351 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: 4352 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col: 4353 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row: 4354 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride: 4355 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: { 4356 Info.opc = ISD::INTRINSIC_VOID; 4357 Info.memVT = MVT::v8f32; 4358 Info.ptrVal = I.getArgOperand(0); 4359 Info.offset = 0; 4360 Info.flags = MachineMemOperand::MOStore; 4361 Info.align = Align(16); 4362 return true; 4363 } 4364 4365 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col: 4366 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride: 4367 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row: 4368 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride: 4369 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col: 4370 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride: 4371 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row: 4372 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride: 4373 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col: 4374 case 
Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride: 4375 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row: 4376 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: { 4377 Info.opc = ISD::INTRINSIC_VOID; 4378 Info.memVT = MVT::v8i32; 4379 Info.ptrVal = I.getArgOperand(0); 4380 Info.offset = 0; 4381 Info.flags = MachineMemOperand::MOStore; 4382 Info.align = Align(16); 4383 return true; 4384 } 4385 4386 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col: 4387 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride: 4388 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row: 4389 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride: 4390 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col: 4391 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride: 4392 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row: 4393 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: { 4394 Info.opc = ISD::INTRINSIC_VOID; 4395 Info.memVT = MVT::v2i32; 4396 Info.ptrVal = I.getArgOperand(0); 4397 Info.offset = 0; 4398 Info.flags = MachineMemOperand::MOStore; 4399 Info.align = Align(8); 4400 return true; 4401 } 4402 4403 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col: 4404 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride: 4405 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row: 4406 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: { 4407 Info.opc = ISD::INTRINSIC_VOID; 4408 Info.memVT = MVT::v2f64; 4409 Info.ptrVal = I.getArgOperand(0); 4410 Info.offset = 0; 4411 Info.flags = MachineMemOperand::MOStore; 4412 Info.align = Align(16); 4413 return true; 4414 } 4415 4416 case Intrinsic::nvvm_atomic_load_inc_32: 4417 case Intrinsic::nvvm_atomic_load_dec_32: 4418 4419 case Intrinsic::nvvm_atomic_add_gen_f_cta: 4420 case Intrinsic::nvvm_atomic_add_gen_f_sys: 4421 case Intrinsic::nvvm_atomic_add_gen_i_cta: 4422 case Intrinsic::nvvm_atomic_add_gen_i_sys: 4423 case Intrinsic::nvvm_atomic_and_gen_i_cta: 4424 case Intrinsic::nvvm_atomic_and_gen_i_sys: 4425 case Intrinsic::nvvm_atomic_cas_gen_i_cta: 4426 case Intrinsic::nvvm_atomic_cas_gen_i_sys: 4427 case Intrinsic::nvvm_atomic_dec_gen_i_cta: 4428 case Intrinsic::nvvm_atomic_dec_gen_i_sys: 4429 case Intrinsic::nvvm_atomic_inc_gen_i_cta: 4430 case Intrinsic::nvvm_atomic_inc_gen_i_sys: 4431 case Intrinsic::nvvm_atomic_max_gen_i_cta: 4432 case Intrinsic::nvvm_atomic_max_gen_i_sys: 4433 case Intrinsic::nvvm_atomic_min_gen_i_cta: 4434 case Intrinsic::nvvm_atomic_min_gen_i_sys: 4435 case Intrinsic::nvvm_atomic_or_gen_i_cta: 4436 case Intrinsic::nvvm_atomic_or_gen_i_sys: 4437 case Intrinsic::nvvm_atomic_exch_gen_i_cta: 4438 case Intrinsic::nvvm_atomic_exch_gen_i_sys: 4439 case Intrinsic::nvvm_atomic_xor_gen_i_cta: 4440 case Intrinsic::nvvm_atomic_xor_gen_i_sys: { 4441 auto &DL = I.getModule()->getDataLayout(); 4442 Info.opc = ISD::INTRINSIC_W_CHAIN; 4443 Info.memVT = getValueType(DL, I.getType()); 4444 Info.ptrVal = I.getArgOperand(0); 4445 Info.offset = 0; 4446 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 4447 Info.align.reset(); 4448 return true; 4449 } 4450 4451 case Intrinsic::nvvm_ldu_global_i: 4452 case Intrinsic::nvvm_ldu_global_f: 4453 case Intrinsic::nvvm_ldu_global_p: { 4454 auto &DL = I.getModule()->getDataLayout(); 4455 Info.opc = ISD::INTRINSIC_W_CHAIN; 4456 if (Intrinsic == Intrinsic::nvvm_ldu_global_i) 4457 Info.memVT = getValueType(DL, I.getType()); 4458 else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) 4459 Info.memVT = getPointerTy(DL); 4460 else 4461 Info.memVT = getValueType(DL, I.getType()); 4462 Info.ptrVal 
= I.getArgOperand(0); 4463 Info.offset = 0; 4464 Info.flags = MachineMemOperand::MOLoad; 4465 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 4466 4467 return true; 4468 } 4469 case Intrinsic::nvvm_ldg_global_i: 4470 case Intrinsic::nvvm_ldg_global_f: 4471 case Intrinsic::nvvm_ldg_global_p: { 4472 auto &DL = I.getModule()->getDataLayout(); 4473 4474 Info.opc = ISD::INTRINSIC_W_CHAIN; 4475 if (Intrinsic == Intrinsic::nvvm_ldg_global_i) 4476 Info.memVT = getValueType(DL, I.getType()); 4477 else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) 4478 Info.memVT = getPointerTy(DL); 4479 else 4480 Info.memVT = getValueType(DL, I.getType()); 4481 Info.ptrVal = I.getArgOperand(0); 4482 Info.offset = 0; 4483 Info.flags = MachineMemOperand::MOLoad; 4484 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 4485 4486 return true; 4487 } 4488 4489 case Intrinsic::nvvm_tex_1d_v4f32_s32: 4490 case Intrinsic::nvvm_tex_1d_v4f32_f32: 4491 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 4492 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 4493 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 4494 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 4495 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 4496 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 4497 case Intrinsic::nvvm_tex_2d_v4f32_s32: 4498 case Intrinsic::nvvm_tex_2d_v4f32_f32: 4499 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 4500 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 4501 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 4502 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 4503 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 4504 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 4505 case Intrinsic::nvvm_tex_3d_v4f32_s32: 4506 case Intrinsic::nvvm_tex_3d_v4f32_f32: 4507 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 4508 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 4509 case Intrinsic::nvvm_tex_cube_v4f32_f32: 4510 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 4511 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 4512 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 4513 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 4514 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 4515 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 4516 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 4517 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 4518 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 4519 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 4520 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 4521 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 4522 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 4523 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 4524 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 4525 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 4526 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 4527 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 4528 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 4529 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 4530 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 4531 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 4532 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 4533 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 4534 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 4535 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 4536 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 4537 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 4538 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 4539 case 
Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 4540 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 4541 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 4542 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 4543 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 4544 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 4545 Info.opc = getOpcForTextureInstr(Intrinsic); 4546 Info.memVT = MVT::v4f32; 4547 Info.ptrVal = nullptr; 4548 Info.offset = 0; 4549 Info.flags = MachineMemOperand::MOLoad; 4550 Info.align = Align(16); 4551 return true; 4552 4553 case Intrinsic::nvvm_tex_1d_v4s32_s32: 4554 case Intrinsic::nvvm_tex_1d_v4s32_f32: 4555 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 4556 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 4557 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 4558 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 4559 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 4560 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 4561 case Intrinsic::nvvm_tex_2d_v4s32_s32: 4562 case Intrinsic::nvvm_tex_2d_v4s32_f32: 4563 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 4564 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 4565 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 4566 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 4567 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 4568 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 4569 case Intrinsic::nvvm_tex_3d_v4s32_s32: 4570 case Intrinsic::nvvm_tex_3d_v4s32_f32: 4571 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 4572 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 4573 case Intrinsic::nvvm_tex_cube_v4s32_f32: 4574 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 4575 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 4576 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 4577 case Intrinsic::nvvm_tex_cube_v4u32_f32: 4578 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 4579 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 4580 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 4581 case Intrinsic::nvvm_tex_1d_v4u32_s32: 4582 case Intrinsic::nvvm_tex_1d_v4u32_f32: 4583 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 4584 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 4585 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 4586 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 4587 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 4588 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 4589 case Intrinsic::nvvm_tex_2d_v4u32_s32: 4590 case Intrinsic::nvvm_tex_2d_v4u32_f32: 4591 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 4592 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 4593 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 4594 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 4595 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 4596 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 4597 case Intrinsic::nvvm_tex_3d_v4u32_s32: 4598 case Intrinsic::nvvm_tex_3d_v4u32_f32: 4599 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 4600 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 4601 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 4602 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 4603 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 4604 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 4605 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 4606 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 4607 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 4608 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 4609 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 4610 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 4611 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 4612 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 4613 case 
Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 4614 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 4615 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 4616 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 4617 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 4618 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 4619 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 4620 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 4621 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 4622 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 4623 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 4624 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 4625 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 4626 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 4627 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 4628 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 4629 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 4630 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 4631 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 4632 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 4633 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 4634 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 4635 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 4636 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 4637 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 4638 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 4639 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 4640 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 4641 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 4642 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 4643 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 4644 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 4645 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 4646 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 4647 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 4648 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 4649 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 4650 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 4651 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 4652 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 4653 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 4654 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 4655 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 4656 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 4657 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 4658 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 4659 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 4660 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 4661 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 4662 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 4663 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 4664 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 4665 Info.opc = getOpcForTextureInstr(Intrinsic); 4666 Info.memVT = MVT::v4i32; 4667 Info.ptrVal = nullptr; 4668 Info.offset = 0; 4669 Info.flags = MachineMemOperand::MOLoad; 4670 Info.align = Align(16); 4671 return true; 4672 4673 case Intrinsic::nvvm_suld_1d_i8_clamp: 4674 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 4675 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 4676 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 4677 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 4678 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 4679 case Intrinsic::nvvm_suld_2d_i8_clamp: 4680 case 
Intrinsic::nvvm_suld_2d_v2i8_clamp: 4681 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 4682 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 4683 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 4684 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 4685 case Intrinsic::nvvm_suld_3d_i8_clamp: 4686 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 4687 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 4688 case Intrinsic::nvvm_suld_1d_i8_trap: 4689 case Intrinsic::nvvm_suld_1d_v2i8_trap: 4690 case Intrinsic::nvvm_suld_1d_v4i8_trap: 4691 case Intrinsic::nvvm_suld_1d_array_i8_trap: 4692 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 4693 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 4694 case Intrinsic::nvvm_suld_2d_i8_trap: 4695 case Intrinsic::nvvm_suld_2d_v2i8_trap: 4696 case Intrinsic::nvvm_suld_2d_v4i8_trap: 4697 case Intrinsic::nvvm_suld_2d_array_i8_trap: 4698 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 4699 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 4700 case Intrinsic::nvvm_suld_3d_i8_trap: 4701 case Intrinsic::nvvm_suld_3d_v2i8_trap: 4702 case Intrinsic::nvvm_suld_3d_v4i8_trap: 4703 case Intrinsic::nvvm_suld_1d_i8_zero: 4704 case Intrinsic::nvvm_suld_1d_v2i8_zero: 4705 case Intrinsic::nvvm_suld_1d_v4i8_zero: 4706 case Intrinsic::nvvm_suld_1d_array_i8_zero: 4707 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 4708 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 4709 case Intrinsic::nvvm_suld_2d_i8_zero: 4710 case Intrinsic::nvvm_suld_2d_v2i8_zero: 4711 case Intrinsic::nvvm_suld_2d_v4i8_zero: 4712 case Intrinsic::nvvm_suld_2d_array_i8_zero: 4713 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 4714 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 4715 case Intrinsic::nvvm_suld_3d_i8_zero: 4716 case Intrinsic::nvvm_suld_3d_v2i8_zero: 4717 case Intrinsic::nvvm_suld_3d_v4i8_zero: 4718 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4719 Info.memVT = MVT::i8; 4720 Info.ptrVal = nullptr; 4721 Info.offset = 0; 4722 Info.flags = MachineMemOperand::MOLoad; 4723 Info.align = Align(16); 4724 return true; 4725 4726 case Intrinsic::nvvm_suld_1d_i16_clamp: 4727 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 4728 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 4729 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 4730 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 4731 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 4732 case Intrinsic::nvvm_suld_2d_i16_clamp: 4733 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 4734 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 4735 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 4736 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 4737 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 4738 case Intrinsic::nvvm_suld_3d_i16_clamp: 4739 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 4740 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 4741 case Intrinsic::nvvm_suld_1d_i16_trap: 4742 case Intrinsic::nvvm_suld_1d_v2i16_trap: 4743 case Intrinsic::nvvm_suld_1d_v4i16_trap: 4744 case Intrinsic::nvvm_suld_1d_array_i16_trap: 4745 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 4746 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 4747 case Intrinsic::nvvm_suld_2d_i16_trap: 4748 case Intrinsic::nvvm_suld_2d_v2i16_trap: 4749 case Intrinsic::nvvm_suld_2d_v4i16_trap: 4750 case Intrinsic::nvvm_suld_2d_array_i16_trap: 4751 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 4752 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 4753 case Intrinsic::nvvm_suld_3d_i16_trap: 4754 case Intrinsic::nvvm_suld_3d_v2i16_trap: 4755 case Intrinsic::nvvm_suld_3d_v4i16_trap: 4756 case Intrinsic::nvvm_suld_1d_i16_zero: 4757 case Intrinsic::nvvm_suld_1d_v2i16_zero: 4758 case 
Intrinsic::nvvm_suld_1d_v4i16_zero: 4759 case Intrinsic::nvvm_suld_1d_array_i16_zero: 4760 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 4761 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 4762 case Intrinsic::nvvm_suld_2d_i16_zero: 4763 case Intrinsic::nvvm_suld_2d_v2i16_zero: 4764 case Intrinsic::nvvm_suld_2d_v4i16_zero: 4765 case Intrinsic::nvvm_suld_2d_array_i16_zero: 4766 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 4767 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 4768 case Intrinsic::nvvm_suld_3d_i16_zero: 4769 case Intrinsic::nvvm_suld_3d_v2i16_zero: 4770 case Intrinsic::nvvm_suld_3d_v4i16_zero: 4771 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4772 Info.memVT = MVT::i16; 4773 Info.ptrVal = nullptr; 4774 Info.offset = 0; 4775 Info.flags = MachineMemOperand::MOLoad; 4776 Info.align = Align(16); 4777 return true; 4778 4779 case Intrinsic::nvvm_suld_1d_i32_clamp: 4780 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 4781 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 4782 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 4783 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 4784 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 4785 case Intrinsic::nvvm_suld_2d_i32_clamp: 4786 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 4787 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 4788 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 4789 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 4790 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 4791 case Intrinsic::nvvm_suld_3d_i32_clamp: 4792 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 4793 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 4794 case Intrinsic::nvvm_suld_1d_i32_trap: 4795 case Intrinsic::nvvm_suld_1d_v2i32_trap: 4796 case Intrinsic::nvvm_suld_1d_v4i32_trap: 4797 case Intrinsic::nvvm_suld_1d_array_i32_trap: 4798 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 4799 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 4800 case Intrinsic::nvvm_suld_2d_i32_trap: 4801 case Intrinsic::nvvm_suld_2d_v2i32_trap: 4802 case Intrinsic::nvvm_suld_2d_v4i32_trap: 4803 case Intrinsic::nvvm_suld_2d_array_i32_trap: 4804 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 4805 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 4806 case Intrinsic::nvvm_suld_3d_i32_trap: 4807 case Intrinsic::nvvm_suld_3d_v2i32_trap: 4808 case Intrinsic::nvvm_suld_3d_v4i32_trap: 4809 case Intrinsic::nvvm_suld_1d_i32_zero: 4810 case Intrinsic::nvvm_suld_1d_v2i32_zero: 4811 case Intrinsic::nvvm_suld_1d_v4i32_zero: 4812 case Intrinsic::nvvm_suld_1d_array_i32_zero: 4813 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 4814 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 4815 case Intrinsic::nvvm_suld_2d_i32_zero: 4816 case Intrinsic::nvvm_suld_2d_v2i32_zero: 4817 case Intrinsic::nvvm_suld_2d_v4i32_zero: 4818 case Intrinsic::nvvm_suld_2d_array_i32_zero: 4819 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 4820 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 4821 case Intrinsic::nvvm_suld_3d_i32_zero: 4822 case Intrinsic::nvvm_suld_3d_v2i32_zero: 4823 case Intrinsic::nvvm_suld_3d_v4i32_zero: 4824 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4825 Info.memVT = MVT::i32; 4826 Info.ptrVal = nullptr; 4827 Info.offset = 0; 4828 Info.flags = MachineMemOperand::MOLoad; 4829 Info.align = Align(16); 4830 return true; 4831 4832 case Intrinsic::nvvm_suld_1d_i64_clamp: 4833 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 4834 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 4835 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 4836 case Intrinsic::nvvm_suld_2d_i64_clamp: 4837 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 4838 case 
Intrinsic::nvvm_suld_2d_array_i64_clamp: 4839 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 4840 case Intrinsic::nvvm_suld_3d_i64_clamp: 4841 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 4842 case Intrinsic::nvvm_suld_1d_i64_trap: 4843 case Intrinsic::nvvm_suld_1d_v2i64_trap: 4844 case Intrinsic::nvvm_suld_1d_array_i64_trap: 4845 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 4846 case Intrinsic::nvvm_suld_2d_i64_trap: 4847 case Intrinsic::nvvm_suld_2d_v2i64_trap: 4848 case Intrinsic::nvvm_suld_2d_array_i64_trap: 4849 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 4850 case Intrinsic::nvvm_suld_3d_i64_trap: 4851 case Intrinsic::nvvm_suld_3d_v2i64_trap: 4852 case Intrinsic::nvvm_suld_1d_i64_zero: 4853 case Intrinsic::nvvm_suld_1d_v2i64_zero: 4854 case Intrinsic::nvvm_suld_1d_array_i64_zero: 4855 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 4856 case Intrinsic::nvvm_suld_2d_i64_zero: 4857 case Intrinsic::nvvm_suld_2d_v2i64_zero: 4858 case Intrinsic::nvvm_suld_2d_array_i64_zero: 4859 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 4860 case Intrinsic::nvvm_suld_3d_i64_zero: 4861 case Intrinsic::nvvm_suld_3d_v2i64_zero: 4862 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4863 Info.memVT = MVT::i64; 4864 Info.ptrVal = nullptr; 4865 Info.offset = 0; 4866 Info.flags = MachineMemOperand::MOLoad; 4867 Info.align = Align(16); 4868 return true; 4869 } 4870 return false; 4871 } 4872 4873 /// getFunctionParamOptimizedAlign - since function arguments are passed via 4874 /// .param space, we may want to increase their alignment in a way that 4875 /// ensures that we can effectively vectorize their loads & stores. We can 4876 /// increase alignment only if the function has internal or has private 4877 /// linkage as for other linkage types callers may already rely on default 4878 /// alignment. To allow using 128-bit vectorized loads/stores, this function 4879 /// ensures that alignment is 16 or greater. 4880 Align NVPTXTargetLowering::getFunctionParamOptimizedAlign( 4881 const Function *F, Type *ArgTy, const DataLayout &DL) const { 4882 const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value(); 4883 4884 // If a function has linkage different from internal or private, we 4885 // must use default ABI alignment as external users rely on it. Same 4886 // for a function that may be called from a function pointer. 4887 if (!F || !F->hasLocalLinkage() || 4888 F->hasAddressTaken(/*Users=*/nullptr, 4889 /*IgnoreCallbackUses=*/false, 4890 /*IgnoreAssumeLikeCalls=*/true, 4891 /*IgnoreLLVMUsed=*/true)) 4892 return Align(ABITypeAlign); 4893 4894 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage"); 4895 return Align(std::max(uint64_t(16), ABITypeAlign)); 4896 } 4897 4898 /// Helper for computing alignment of a device function byval parameter. 4899 Align NVPTXTargetLowering::getFunctionByValParamAlign( 4900 const Function *F, Type *ArgTy, Align InitialAlign, 4901 const DataLayout &DL) const { 4902 Align ArgAlign = InitialAlign; 4903 // Try to increase alignment to enhance vectorization options. 4904 if (F) 4905 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL)); 4906 4907 // Old ptx versions have a bug. When PTX code takes address of 4908 // byval parameter with alignment < 4, ptxas generates code to 4909 // spill argument into memory. Alas on sm_50+ ptxas generates 4910 // SASS code that fails with misaligned access. To work around 4911 // the problem, make sure that we align byval parameters by at 4912 // least 4. 
This bug seems to be fixed at least starting from 4913 // ptxas > 9.0. 4914 // TODO: remove this after verifying the bug is not reproduced 4915 // on non-deprecated ptxas versions. 4916 if (ForceMinByValParamAlign) 4917 ArgAlign = std::max(ArgAlign, Align(4)); 4918 4919 return ArgAlign; 4920 } 4921 4922 // Helper for getting a function parameter name. Name is composed from 4923 // its index and the function name. Negative index corresponds to special 4924 // parameter (unsized array) used for passing variable arguments. 4925 std::string NVPTXTargetLowering::getParamName(const Function *F, 4926 int Idx) const { 4927 std::string ParamName; 4928 raw_string_ostream ParamStr(ParamName); 4929 4930 ParamStr << getTargetMachine().getSymbol(F)->getName(); 4931 if (Idx < 0) 4932 ParamStr << "_vararg"; 4933 else 4934 ParamStr << "_param_" << Idx; 4935 4936 return ParamName; 4937 } 4938 4939 /// isLegalAddressingMode - Return true if the addressing mode represented 4940 /// by AM is legal for this target, for a load/store of the specified type. 4941 /// Used to guide target specific optimizations, like loop strength reduction 4942 /// (LoopStrengthReduce.cpp) and memory optimization for address mode 4943 /// (CodeGenPrepare.cpp) 4944 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, 4945 const AddrMode &AM, Type *Ty, 4946 unsigned AS, Instruction *I) const { 4947 // AddrMode - This represents an addressing mode of: 4948 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg 4949 // 4950 // The legal address modes are 4951 // - [avar] 4952 // - [areg] 4953 // - [areg+immoff] 4954 // - [immAddr] 4955 4956 if (AM.BaseGV) { 4957 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale; 4958 } 4959 4960 switch (AM.Scale) { 4961 case 0: // "r", "r+i" or "i" is allowed 4962 break; 4963 case 1: 4964 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed. 4965 return false; 4966 // Otherwise we have r+i. 4967 break; 4968 default: 4969 // No scale > 1 is allowed 4970 return false; 4971 } 4972 return true; 4973 } 4974 4975 //===----------------------------------------------------------------------===// 4976 // NVPTX Inline Assembly Support 4977 //===----------------------------------------------------------------------===// 4978 4979 /// getConstraintType - Given a constraint letter, return the type of 4980 /// constraint it is for this target. 
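/// The single-letter constraints accepted here all select PTX register
/// classes (for example 'r' selects a 32-bit integer register and 'f'/'d'
/// select f32/f64 registers); see getRegForInlineAsmConstraint below for the
/// mapping to the concrete register classes.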
4981 NVPTXTargetLowering::ConstraintType 4982 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { 4983 if (Constraint.size() == 1) { 4984 switch (Constraint[0]) { 4985 default: 4986 break; 4987 case 'b': 4988 case 'r': 4989 case 'h': 4990 case 'c': 4991 case 'l': 4992 case 'f': 4993 case 'd': 4994 case '0': 4995 case 'N': 4996 return C_RegisterClass; 4997 } 4998 } 4999 return TargetLowering::getConstraintType(Constraint); 5000 } 5001 5002 std::pair<unsigned, const TargetRegisterClass *> 5003 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 5004 StringRef Constraint, 5005 MVT VT) const { 5006 if (Constraint.size() == 1) { 5007 switch (Constraint[0]) { 5008 case 'b': 5009 return std::make_pair(0U, &NVPTX::Int1RegsRegClass); 5010 case 'c': 5011 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 5012 case 'h': 5013 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 5014 case 'r': 5015 return std::make_pair(0U, &NVPTX::Int32RegsRegClass); 5016 case 'l': 5017 case 'N': 5018 return std::make_pair(0U, &NVPTX::Int64RegsRegClass); 5019 case 'f': 5020 return std::make_pair(0U, &NVPTX::Float32RegsRegClass); 5021 case 'd': 5022 return std::make_pair(0U, &NVPTX::Float64RegsRegClass); 5023 } 5024 } 5025 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 5026 } 5027 5028 //===----------------------------------------------------------------------===// 5029 // NVPTX DAG Combining 5030 //===----------------------------------------------------------------------===// 5031 5032 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, 5033 CodeGenOptLevel OptLevel) const { 5034 // Always honor command-line argument 5035 if (FMAContractLevelOpt.getNumOccurrences() > 0) 5036 return FMAContractLevelOpt > 0; 5037 5038 // Do not contract if we're not optimizing the code. 5039 if (OptLevel == CodeGenOptLevel::None) 5040 return false; 5041 5042 // Honor TargetOptions flags that explicitly say fusion is okay. 5043 if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast) 5044 return true; 5045 5046 return allowUnsafeFPMath(MF); 5047 } 5048 5049 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const { 5050 // Honor TargetOptions flags that explicitly say unsafe math is okay. 5051 if (MF.getTarget().Options.UnsafeFPMath) 5052 return true; 5053 5054 // Allow unsafe math if unsafe-fp-math attribute explicitly says so. 5055 const Function &F = MF.getFunction(); 5056 return F.getFnAttribute("unsafe-fp-math").getValueAsBool(); 5057 } 5058 5059 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 5060 /// operands N0 and N1. This is a helper for PerformADDCombine that is 5061 /// called with the default operands, and if that fails, with commuted 5062 /// operands. 5063 static SDValue PerformADDCombineWithOperands( 5064 SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, 5065 const NVPTXSubtarget &Subtarget, CodeGenOptLevel OptLevel) { 5066 SelectionDAG &DAG = DCI.DAG; 5067 // Skip non-integer, non-scalar case 5068 EVT VT=N0.getValueType(); 5069 if (VT.isVector()) 5070 return SDValue(); 5071 5072 // fold (add (mul a, b), c) -> (mad a, b, c) 5073 // 5074 if (N0.getOpcode() == ISD::MUL) { 5075 assert (VT.isInteger()); 5076 // For integer: 5077 // Since integer multiply-add costs the same as integer multiply 5078 // but is more costly than integer add, do the fusion only when 5079 // the mul is only used in the add. 
    if (OptLevel == CodeGenOptLevel::None || VT != MVT::i32 ||
        !N0.getNode()->hasOneUse())
      return SDValue();

    // Do the folding
    return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
                       N0.getOperand(0), N0.getOperand(1), N1);
  }
  else if (N0.getOpcode() == ISD::FMUL) {
    if (VT == MVT::f32 || VT == MVT::f64) {
      const auto *TLI = static_cast<const NVPTXTargetLowering *>(
          &DAG.getTargetLoweringInfo());
      if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
        return SDValue();

      // For floating point:
      // Do the fusion only when the mul has less than 5 uses and all
      // are add.
      // The heuristic is that if a use is not an add, then that use
      // cannot be fused into fma, therefore mul is still needed anyway.
      // If there are more than 4 uses, even if they are all add, fusing
      // them will increase register pressure.
      //
      int numUses = 0;
      int nonAddCount = 0;
      for (const SDNode *User : N0.getNode()->uses()) {
        numUses++;
        if (User->getOpcode() != ISD::FADD)
          ++nonAddCount;
      }
      if (numUses >= 5)
        return SDValue();
      if (nonAddCount) {
        int orderNo = N->getIROrder();
        int orderNo2 = N0.getNode()->getIROrder();
        // Simple heuristic for estimating register pressure: the difference
        // in IR order approximates the distance between the def and this use,
        // and a longer distance makes added register pressure more likely.
        if (orderNo - orderNo2 < 500)
          return SDValue();

        // Now, check if at least one of the FMUL's operands is live beyond
        // the node N, which guarantees that the FMA will not increase
        // register pressure at node N.
        bool opIsLive = false;
        const SDNode *left = N0.getOperand(0).getNode();
        const SDNode *right = N0.getOperand(1).getNode();

        if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
          opIsLive = true;

        if (!opIsLive)
          for (const SDNode *User : left->uses()) {
            int orderNo3 = User->getIROrder();
            if (orderNo3 > orderNo) {
              opIsLive = true;
              break;
            }
          }

        if (!opIsLive)
          for (const SDNode *User : right->uses()) {
            int orderNo3 = User->getIROrder();
            if (orderNo3 > orderNo) {
              opIsLive = true;
              break;
            }
          }

        if (!opIsLive)
          return SDValue();
      }

      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
                         N0.getOperand(0), N0.getOperand(1), N1);
    }
  }

  return SDValue();
}

static SDValue PerformStoreRetvalCombine(SDNode *N) {
  // Operands from the 2nd to the last one are the values to be stored
  for (std::size_t I = 2, OpsCount = N->ops().size(); I != OpsCount; ++I)
    if (!N->getOperand(I).isUndef())
      return SDValue();

  // Operand 0 is the previous value in the chain. Cannot return EntryToken
  // as the previous value will become unused and eliminated later.
  return N->getOperand(0);
}

/// PerformADDCombine - Target-specific DAG combine transforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const NVPTXSubtarget &Subtarget,
                                 CodeGenOptLevel OptLevel) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // First try with the default operand order.
5182 if (SDValue Result = 5183 PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel)) 5184 return Result; 5185 5186 // If that didn't work, try again with the operands commuted. 5187 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel); 5188 } 5189 5190 static SDValue PerformANDCombine(SDNode *N, 5191 TargetLowering::DAGCombinerInfo &DCI) { 5192 // The type legalizer turns a vector load of i8 values into a zextload to i16 5193 // registers, optionally ANY_EXTENDs it (if target type is integer), 5194 // and ANDs off the high 8 bits. Since we turn this load into a 5195 // target-specific DAG node, the DAG combiner fails to eliminate these AND 5196 // nodes. Do that here. 5197 SDValue Val = N->getOperand(0); 5198 SDValue Mask = N->getOperand(1); 5199 5200 if (isa<ConstantSDNode>(Val)) { 5201 std::swap(Val, Mask); 5202 } 5203 5204 SDValue AExt; 5205 5206 // Convert BFE-> truncate i16 -> and 255 5207 // To just BFE-> truncate i16, as the value already has all the bits in the 5208 // right places. 5209 if (Val.getOpcode() == ISD::TRUNCATE) { 5210 SDValue BFE = Val.getOperand(0); 5211 if (BFE.getOpcode() != NVPTXISD::BFE) 5212 return SDValue(); 5213 5214 ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0)); 5215 if (!BFEBits) 5216 return SDValue(); 5217 uint64_t BFEBitsVal = BFEBits->getZExtValue(); 5218 5219 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); 5220 if (!MaskCnst) { 5221 // Not an AND with a constant 5222 return SDValue(); 5223 } 5224 uint64_t MaskVal = MaskCnst->getZExtValue(); 5225 5226 if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1) 5227 return SDValue(); 5228 // If we get here, the AND is unnecessary. Just replace it with the trunc 5229 DCI.CombineTo(N, Val, false); 5230 } 5231 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and 5232 if (Val.getOpcode() == ISD::ANY_EXTEND) { 5233 AExt = Val; 5234 Val = Val->getOperand(0); 5235 } 5236 5237 if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { 5238 Val = Val->getOperand(0); 5239 } 5240 5241 if (Val->getOpcode() == NVPTXISD::LoadV2 || 5242 Val->getOpcode() == NVPTXISD::LoadV4) { 5243 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); 5244 if (!MaskCnst) { 5245 // Not an AND with a constant 5246 return SDValue(); 5247 } 5248 5249 uint64_t MaskVal = MaskCnst->getZExtValue(); 5250 if (MaskVal != 0xff) { 5251 // Not an AND that chops off top 8 bits 5252 return SDValue(); 5253 } 5254 5255 MemSDNode *Mem = dyn_cast<MemSDNode>(Val); 5256 if (!Mem) { 5257 // Not a MemSDNode?!? 5258 return SDValue(); 5259 } 5260 5261 EVT MemVT = Mem->getMemoryVT(); 5262 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { 5263 // We only handle the i8 case 5264 return SDValue(); 5265 } 5266 5267 unsigned ExtType = 5268 cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))-> 5269 getZExtValue(); 5270 if (ExtType == ISD::SEXTLOAD) { 5271 // If for some reason the load is a sextload, the and is needed to zero 5272 // out the high 8 bits 5273 return SDValue(); 5274 } 5275 5276 bool AddTo = false; 5277 if (AExt.getNode() != nullptr) { 5278 // Re-insert the ext as a zext. 5279 Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), 5280 AExt.getValueType(), Val); 5281 AddTo = true; 5282 } 5283 5284 // If we get here, the AND is unnecessary. 
Just replace it with the load 5285 DCI.CombineTo(N, Val, AddTo); 5286 } 5287 5288 return SDValue(); 5289 } 5290 5291 static SDValue PerformREMCombine(SDNode *N, 5292 TargetLowering::DAGCombinerInfo &DCI, 5293 CodeGenOptLevel OptLevel) { 5294 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM); 5295 5296 // Don't do anything at less than -O2. 5297 if (OptLevel < CodeGenOptLevel::Default) 5298 return SDValue(); 5299 5300 SelectionDAG &DAG = DCI.DAG; 5301 SDLoc DL(N); 5302 EVT VT = N->getValueType(0); 5303 bool IsSigned = N->getOpcode() == ISD::SREM; 5304 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV; 5305 5306 const SDValue &Num = N->getOperand(0); 5307 const SDValue &Den = N->getOperand(1); 5308 5309 for (const SDNode *U : Num->uses()) { 5310 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num && 5311 U->getOperand(1) == Den) { 5312 // Num % Den -> Num - (Num / Den) * Den 5313 return DAG.getNode(ISD::SUB, DL, VT, Num, 5314 DAG.getNode(ISD::MUL, DL, VT, 5315 DAG.getNode(DivOpc, DL, VT, Num, Den), 5316 Den)); 5317 } 5318 } 5319 return SDValue(); 5320 } 5321 5322 enum OperandSignedness { 5323 Signed = 0, 5324 Unsigned, 5325 Unknown 5326 }; 5327 5328 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand 5329 /// that can be demoted to \p OptSize bits without loss of information. The 5330 /// signedness of the operand, if determinable, is placed in \p S. 5331 static bool IsMulWideOperandDemotable(SDValue Op, 5332 unsigned OptSize, 5333 OperandSignedness &S) { 5334 S = Unknown; 5335 5336 if (Op.getOpcode() == ISD::SIGN_EXTEND || 5337 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { 5338 EVT OrigVT = Op.getOperand(0).getValueType(); 5339 if (OrigVT.getFixedSizeInBits() <= OptSize) { 5340 S = Signed; 5341 return true; 5342 } 5343 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { 5344 EVT OrigVT = Op.getOperand(0).getValueType(); 5345 if (OrigVT.getFixedSizeInBits() <= OptSize) { 5346 S = Unsigned; 5347 return true; 5348 } 5349 } 5350 5351 return false; 5352 } 5353 5354 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can 5355 /// be demoted to \p OptSize bits without loss of information. If the operands 5356 /// contain a constant, it should appear as the RHS operand. The signedness of 5357 /// the operands is placed in \p IsSigned. 5358 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, 5359 unsigned OptSize, 5360 bool &IsSigned) { 5361 OperandSignedness LHSSign; 5362 5363 // The LHS operand must be a demotable op 5364 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign)) 5365 return false; 5366 5367 // We should have been able to determine the signedness from the LHS 5368 if (LHSSign == Unknown) 5369 return false; 5370 5371 IsSigned = (LHSSign == Signed); 5372 5373 // The RHS can be a demotable op or a constant 5374 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { 5375 const APInt &Val = CI->getAPIntValue(); 5376 if (LHSSign == Unsigned) { 5377 return Val.isIntN(OptSize); 5378 } else { 5379 return Val.isSignedIntN(OptSize); 5380 } 5381 } else { 5382 OperandSignedness RHSSign; 5383 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) 5384 return false; 5385 5386 return LHSSign == RHSSign; 5387 } 5388 } 5389 5390 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply 5391 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform 5392 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift 5393 /// amount. 
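/// For example, on i32 this turns (mul (sext i16 %a), (sext i16 %b)) into
/// mul.wide.s16 %a, %b, and (shl (zext i16 %a), 4) into mul.wide.u16 %a, 16,
/// producing the full 32-bit product directly.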
5394 static SDValue TryMULWIDECombine(SDNode *N, 5395 TargetLowering::DAGCombinerInfo &DCI) { 5396 EVT MulType = N->getValueType(0); 5397 if (MulType != MVT::i32 && MulType != MVT::i64) { 5398 return SDValue(); 5399 } 5400 5401 SDLoc DL(N); 5402 unsigned OptSize = MulType.getSizeInBits() >> 1; 5403 SDValue LHS = N->getOperand(0); 5404 SDValue RHS = N->getOperand(1); 5405 5406 // Canonicalize the multiply so the constant (if any) is on the right 5407 if (N->getOpcode() == ISD::MUL) { 5408 if (isa<ConstantSDNode>(LHS)) { 5409 std::swap(LHS, RHS); 5410 } 5411 } 5412 5413 // If we have a SHL, determine the actual multiply amount 5414 if (N->getOpcode() == ISD::SHL) { 5415 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS); 5416 if (!ShlRHS) { 5417 return SDValue(); 5418 } 5419 5420 APInt ShiftAmt = ShlRHS->getAPIntValue(); 5421 unsigned BitWidth = MulType.getSizeInBits(); 5422 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) { 5423 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; 5424 RHS = DCI.DAG.getConstant(MulVal, DL, MulType); 5425 } else { 5426 return SDValue(); 5427 } 5428 } 5429 5430 bool Signed; 5431 // Verify that our operands are demotable 5432 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) { 5433 return SDValue(); 5434 } 5435 5436 EVT DemotedVT; 5437 if (MulType == MVT::i32) { 5438 DemotedVT = MVT::i16; 5439 } else { 5440 DemotedVT = MVT::i32; 5441 } 5442 5443 // Truncate the operands to the correct size. Note that these are just for 5444 // type consistency and will (likely) be eliminated in later phases. 5445 SDValue TruncLHS = 5446 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS); 5447 SDValue TruncRHS = 5448 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS); 5449 5450 unsigned Opc; 5451 if (Signed) { 5452 Opc = NVPTXISD::MUL_WIDE_SIGNED; 5453 } else { 5454 Opc = NVPTXISD::MUL_WIDE_UNSIGNED; 5455 } 5456 5457 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS); 5458 } 5459 5460 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes. 5461 static SDValue PerformMULCombine(SDNode *N, 5462 TargetLowering::DAGCombinerInfo &DCI, 5463 CodeGenOptLevel OptLevel) { 5464 if (OptLevel > CodeGenOptLevel::None) { 5465 // Try mul.wide combining at OptLevel > 0 5466 if (SDValue Ret = TryMULWIDECombine(N, DCI)) 5467 return Ret; 5468 } 5469 5470 return SDValue(); 5471 } 5472 5473 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes. 5474 static SDValue PerformSHLCombine(SDNode *N, 5475 TargetLowering::DAGCombinerInfo &DCI, 5476 CodeGenOptLevel OptLevel) { 5477 if (OptLevel > CodeGenOptLevel::None) { 5478 // Try mul.wide combining at OptLevel > 0 5479 if (SDValue Ret = TryMULWIDECombine(N, DCI)) 5480 return Ret; 5481 } 5482 5483 return SDValue(); 5484 } 5485 5486 static SDValue PerformSETCCCombine(SDNode *N, 5487 TargetLowering::DAGCombinerInfo &DCI, 5488 unsigned int SmVersion) { 5489 EVT CCType = N->getValueType(0); 5490 SDValue A = N->getOperand(0); 5491 SDValue B = N->getOperand(1); 5492 5493 EVT AType = A.getValueType(); 5494 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16))) 5495 return SDValue(); 5496 5497 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90) 5498 return SDValue(); 5499 5500 SDLoc DL(N); 5501 // setp.f16x2 returns two scalar predicates, which we need to 5502 // convert back to v2i1. The returned result will be scalarized by 5503 // the legalizer, but the comparison will remain a single vector 5504 // instruction. 
  SDValue CCNode = DCI.DAG.getNode(
      A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
                                     : NVPTXISD::SETP_BF16X2,
      DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
                         CCNode.getValue(1));
}

static SDValue PerformEXTRACTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Vector = N->getOperand(0);
  SDLoc DL(N);
  EVT VectorVT = Vector.getValueType();
  if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
      IsPTXVectorType(VectorVT.getSimpleVT()))
    return SDValue(); // Native vector loads already combine nicely w/
                      // extract_vector_elt, except for v4i8.
  // Don't mess with singletons or v2*16 types, we already handle them OK.
  if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
      VectorVT == MVT::v4i8)
    return SDValue();

  uint64_t VectorBits = VectorVT.getSizeInBits();
  // We only handle the types we can extract in-register.
  if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
    return SDValue();

  ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
  // Index == 0 is handled by generic DAG combiner.
  if (!Index || Index->getZExtValue() == 0)
    return SDValue();

  MVT IVT = MVT::getIntegerVT(VectorBits);
  EVT EltVT = VectorVT.getVectorElementType();
  EVT EltIVT = EltVT.changeTypeToInteger();
  uint64_t EltBits = EltVT.getScalarSizeInBits();

  SDValue Result = DCI.DAG.getNode(
      ISD::TRUNCATE, DL, EltIVT,
      DCI.DAG.getNode(
          ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
          DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));

  // If the element has a non-integer type, bitcast it back to the expected
  // type.
  if (EltVT != EltIVT)
    Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
  // Past the legalizer, we may need to extend i8 -> i16 to match the register
  // type.
  if (EltVT != N->getValueType(0))
    Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);

  return Result;
}

static SDValue PerformVSELECTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  SDValue VA = N->getOperand(1);
  EVT VectorVT = VA.getValueType();
  if (VectorVT != MVT::v4i8)
    return SDValue();

  // We need to split the vselect into individual per-element operations.
  // Because we use BFE/BFI instructions for byte extraction/insertion, we end
  // up with 32-bit values anyway, so we may as well do the comparison as i32
  // to avoid the conversions to/from i16 normally used for i8 values.
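  // For example, (vselect v4i1 %c, v4i8 %a, v4i8 %b) is expanded into four
  // i32 selects on the extended elements, and the i8 results are packed back
  // together with a BUILD_VECTOR of v4i8.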
5569 SmallVector<SDValue, 4> E; 5570 SDLoc DL(N); 5571 SDValue VCond = N->getOperand(0); 5572 SDValue VB = N->getOperand(2); 5573 for (int I = 0; I < 4; ++I) { 5574 SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond, 5575 DCI.DAG.getConstant(I, DL, MVT::i32)); 5576 SDValue EA = DCI.DAG.getAnyExtOrTrunc( 5577 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA, 5578 DCI.DAG.getConstant(I, DL, MVT::i32)), 5579 DL, MVT::i32); 5580 SDValue EB = DCI.DAG.getAnyExtOrTrunc( 5581 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB, 5582 DCI.DAG.getConstant(I, DL, MVT::i32)), 5583 DL, MVT::i32); 5584 E.push_back(DCI.DAG.getAnyExtOrTrunc( 5585 DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8)); 5586 } 5587 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E); 5588 } 5589 5590 static SDValue PerformLOADCombine(SDNode *N, 5591 TargetLowering::DAGCombinerInfo &DCI) { 5592 SelectionDAG &DAG = DCI.DAG; 5593 LoadSDNode *LD = cast<LoadSDNode>(N); 5594 5595 // Lower a v16i8 load into a LoadV4 operation with i32 results instead of 5596 // letting ReplaceLoadVector split it into smaller loads during legalization. 5597 // This is done at dag-combine1 time, so that vector operations with i8 5598 // elements can be optimised away instead of being needlessly split during 5599 // legalization, which involves storing to the stack and loading it back. 5600 EVT VT = N->getValueType(0); 5601 if (VT != MVT::v16i8) 5602 return SDValue(); 5603 5604 SDLoc DL(N); 5605 5606 // Create a v4i32 vector load operation, effectively <4 x v4i8>. 5607 unsigned Opc = NVPTXISD::LoadV4; 5608 EVT NewVT = MVT::v4i32; 5609 EVT EltVT = NewVT.getVectorElementType(); 5610 unsigned NumElts = NewVT.getVectorNumElements(); 5611 EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other}; 5612 SDVTList RetVTList = DAG.getVTList(RetVTs); 5613 SmallVector<SDValue, 8> Ops(N->ops()); 5614 Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL)); 5615 SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT, 5616 LD->getMemOperand()); 5617 SDValue NewChain = NewLoad.getValue(NumElts); 5618 5619 // Create a vector of the same type returned by the original load. 
  SmallVector<SDValue, 4> Elts;
  for (unsigned i = 0; i < NumElts; i++)
    Elts.push_back(NewLoad.getValue(i));
  return DCI.DAG.getMergeValues(
      {DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
       NewChain},
      DL);
}

SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
  switch (N->getOpcode()) {
  default: break;
  case ISD::ADD:
  case ISD::FADD:
    return PerformADDCombine(N, DCI, STI, OptLevel);
  case ISD::MUL:
    return PerformMULCombine(N, DCI, OptLevel);
  case ISD::SHL:
    return PerformSHLCombine(N, DCI, OptLevel);
  case ISD::AND:
    return PerformANDCombine(N, DCI);
  case ISD::UREM:
  case ISD::SREM:
    return PerformREMCombine(N, DCI, OptLevel);
  case ISD::SETCC:
    return PerformSETCCCombine(N, DCI, STI.getSmVersion());
  case ISD::LOAD:
    return PerformLOADCombine(N, DCI);
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    return PerformStoreRetvalCombine(N);
  case ISD::EXTRACT_VECTOR_ELT:
    return PerformEXTRACTCombine(N, DCI);
  case ISD::VSELECT:
    return PerformVSELECTCombine(N, DCI);
  }
  return SDValue();
}

/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                              SmallVectorImpl<SDValue> &Results) {
  EVT ResVT = N->getValueType(0);
  SDLoc DL(N);

  assert(ResVT.isVector() && "Vector load must have vector type");

  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
  // legal. We can (and should) split that into 2 loads of <2 x double> here
  // but I'm leaving that as a TODO for now.
  assert(ResVT.isSimple() && "Can only handle simple types");
  switch (ResVT.getSimpleVT().SimpleTy) {
  default:
    return;
  case MVT::v2i8:
  case MVT::v2i16:
  case MVT::v2i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v2f32:
  case MVT::v2f64:
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v4i32:
  case MVT::v4f16:
  case MVT::v4f32:
  case MVT::v8f16:  // <4 x f16x2>
  case MVT::v8bf16: // <4 x bf16x2>
  case MVT::v8i16:  // <4 x i16x2>
    // This is a "native" vector type
    break;
  }

  LoadSDNode *LD = cast<LoadSDNode>(N);

  Align Alignment = LD->getAlign();
  auto &TD = DAG.getDataLayout();
  Align PrefAlign =
      TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
  if (Alignment < PrefAlign) {
    // This load is not sufficiently aligned, so bail out and let this vector
    // load be scalarized.  Note that we may still be able to emit smaller
    // vector loads.  For example, if we are loading a <4 x float> with an
    // alignment of 8, this check will fail but the legalizer will try again
    // with 2 x <2 x float>, which will succeed with an alignment of 8.
    return;
  }

  EVT EltVT = ResVT.getVectorElementType();
  unsigned NumElts = ResVT.getVectorNumElements();

  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
  // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
  // loaded type to i16 and propagate the "real" type as the memory type.
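  // For example, a v4i8 load becomes a LoadV4 producing four i16 values with
  // v4i8 as the memory type; the results are truncated back to i8 below when
  // NeedTrunc is set.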
  bool NeedTrunc = false;
  if (EltVT.getSizeInBits() < 16) {
    EltVT = MVT::i16;
    NeedTrunc = true;
  }

  unsigned Opcode = 0;
  SDVTList LdResVTs;
  bool Load16x2 = false;

  switch (NumElts) {
  default:
    return;
  case 2:
    Opcode = NVPTXISD::LoadV2;
    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
    break;
  case 4: {
    Opcode = NVPTXISD::LoadV4;
    EVT ListVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  case 8: {
    // v8f16 is a special case: PTX has no ld.v8.f16 instruction (and likewise
    // for v8bf16 and v8i16). Instead, we split the vector into v2f16 chunks
    // and load them with ld.v4.b32.
    assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
    Load16x2 = true;
    Opcode = NVPTXISD::LoadV4;
    EVT VVT;
    switch (EltVT.getSimpleVT().SimpleTy) {
    case MVT::f16:
      VVT = MVT::v2f16;
      break;
    case MVT::bf16:
      VVT = MVT::v2bf16;
      break;
    case MVT::i16:
      VVT = MVT::v2i16;
      break;
    default:
      llvm_unreachable("Unsupported v8 vector type.");
    }
    EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  }

  // Copy regular operands.
  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());

  // The select routine does not have access to the LoadSDNode instance, so
  // pass along the extension information.
  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));

  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                          LD->getMemoryVT(),
                                          LD->getMemOperand());

  SmallVector<SDValue, 8> ScalarRes;
  if (Load16x2) {
    // Split the v2f16/v2bf16/v2i16 subvectors back into individual elements.
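    // Each of the four two-element results from the load contributes two
    // scalars, so the final BUILD_VECTOR still has the original element count.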
    NumElts /= 2;
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue SubVector = NewLD.getValue(i);
      SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(0, DL));
      SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(1, DL));
      ScalarRes.push_back(E0);
      ScalarRes.push_back(E1);
    }
  } else {
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue Res = NewLD.getValue(i);
      if (NeedTrunc)
        Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
      ScalarRes.push_back(Res);
    }
  }

  SDValue LoadChain = NewLD.getValue(NumElts);

  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);

  Results.push_back(BuildVec);
  Results.push_back(LoadChain);
}

static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &Results) {
  SDValue Chain = N->getOperand(0);
  SDValue Intrin = N->getOperand(1);
  SDLoc DL(N);

  // Get the intrinsic ID.
  unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
  switch (IntrinNo) {
  default:
    return;
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    EVT ResVT = N->getValueType(0);

    if (ResVT.isVector()) {
      // Vector LDG/LDU.

      unsigned NumElts = ResVT.getVectorNumElements();
      EVT EltVT = ResVT.getVectorElementType();

      // Since LDU/LDG are target nodes, we cannot rely on DAG type
      // legalization. Therefore, we must ensure the type is legal. For i1 and
      // i8, we set the loaded type to i16 and propagate the "real" type as
      // the memory type.
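      // As in ReplaceLoadVector above, sub-16-bit elements are widened to i16
      // here and truncated back to the original element type after the load.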
      bool NeedTrunc = false;
      if (EltVT.getSizeInBits() < 16) {
        EltVT = MVT::i16;
        NeedTrunc = true;
      }

      unsigned Opcode = 0;
      SDVTList LdResVTs;

      switch (NumElts) {
      default:
        return;
      case 2:
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV2;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV2;
          break;
        }
        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
        break;
      case 4: {
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV4;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV4;
          break;
        }
        EVT ListVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
        LdResVTs = DAG.getVTList(ListVTs);
        break;
      }
      }

      // Copy the chain and the regular operands, skipping operand 1 (the
      // intrinsic ID).
      SmallVector<SDValue, 8> OtherOps;
      OtherOps.push_back(Chain);
      OtherOps.append(N->op_begin() + 2, N->op_end());

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                              MemSD->getMemoryVT(),
                                              MemSD->getMemOperand());

      SmallVector<SDValue, 4> ScalarRes;

      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue Res = NewLD.getValue(i);
        if (NeedTrunc)
          Res =
              DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
        ScalarRes.push_back(Res);
      }

      SDValue LoadChain = NewLD.getValue(NumElts);

      SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);

      Results.push_back(BuildVec);
      Results.push_back(LoadChain);
    } else {
      // i8 LDG/LDU.
      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
             "Custom handling of non-i8 ldu/ldg?");

      // Just copy all operands as-is.
      SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());

      // Force output to i16.
      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      // We make sure the memory type is i8, which will be used during isel
      // to select the proper instruction.
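      // The node keeps the generic INTRINSIC_W_CHAIN opcode; only its result
      // type is widened to i16, and the value is truncated back to i8 below.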
      SDValue NewLD =
          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
                                  MVT::i8, MemSD->getMemOperand());

      Results.push_back(
          DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, NewLD.getValue(0)));
      Results.push_back(NewLD.getValue(1));
    }
  }
  }
}

void NVPTXTargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    report_fatal_error("Unhandled custom legalization");
  case ISD::LOAD:
    ReplaceLoadVector(N, DAG, Results);
    return;
  case ISD::INTRINSIC_W_CHAIN:
    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
    return;
  }
}

NVPTXTargetLowering::AtomicExpansionKind
NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  Type *Ty = AI->getValOperand()->getType();

  if (AI->isFloatingPointOperation()) {
    if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
      if (Ty->isFloatTy())
        return AtomicExpansionKind::None;
      if (Ty->isDoubleTy() && STI.hasAtomAddF64())
        return AtomicExpansionKind::None;
    }
    return AtomicExpansionKind::CmpXChg;
  }

  assert(Ty->isIntegerTy() && "Ty should be integer at this point");
  auto ITy = cast<llvm::IntegerType>(Ty);

  switch (AI->getOperation()) {
  default:
    return AtomicExpansionKind::CmpXChg;
  case AtomicRMWInst::BinOp::And:
  case AtomicRMWInst::BinOp::Or:
  case AtomicRMWInst::BinOp::Xor:
  case AtomicRMWInst::BinOp::Xchg:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomBitwise64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  case AtomicRMWInst::BinOp::Add:
  case AtomicRMWInst::BinOp::Sub:
  case AtomicRMWInst::BinOp::Max:
  case AtomicRMWInst::BinOp::Min:
  case AtomicRMWInst::BinOp::UMax:
  case AtomicRMWInst::BinOp::UMin:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomMinMax64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  }

  return AtomicExpansionKind::CmpXChg;
}

// Pin NVPTXTargetObjectFile's vtables to this file.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;

MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
  return getDataSection();
}