//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelLowering.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/FPEnv.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iterator>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#define DEBUG_TYPE "nvptx-lower"

using namespace llvm;

static std::atomic<unsigned> GlobalUniqueCallSite;

static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"),
    cl::init(false));

static cl::opt<unsigned> FMAContractLevelOpt(
    "nvptx-fma-level", cl::Hidden,
    cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
             " 1: do it, 2: do it aggressively)"),
    cl::init(2));

static cl::opt<int> UsePrecDivF32(
    "nvptx-prec-divf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
             " IEEE Compliant F32 div.rn if available."),
    cl::init(2));

static cl::opt<bool> UsePrecSqrtF32(
    "nvptx-prec-sqrtf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
    cl::init(true));

static cl::opt<bool> ForceMinByValParamAlign(
    "nvptx-force-min-byval-param-align", cl::Hidden,
    cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
             " params of device functions."),
    cl::init(false));

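// Note: the returned "level" follows the -nvptx-prec-divf32 option described
// above: 0 requests div.approx.f32, 1 div.full.f32, and 2 the IEEE-compliant
// div.rn.f32 when it is available.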
int NVPTXTargetLowering::getDivF32Level() const {
  if (UsePrecDivF32.getNumOccurrences() > 0) {
    // If nvptx-prec-divf32=N is used on the command-line, always honor it
    return UsePrecDivF32;
  } else {
    // Otherwise, use div.approx if fast math is enabled
    if (getTargetMachine().Options.UnsafeFPMath)
      return 0;
    else
      return 2;
  }
}

bool NVPTXTargetLowering::usePrecSqrtF32() const {
  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
    return UsePrecSqrtF32;
  } else {
    // Otherwise, use sqrt.approx if fast math is enabled
    return !getTargetMachine().Options.UnsafeFPMath;
  }
}

bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
  return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
         DenormalMode::PreserveSign;
}

static bool IsPTXVectorType(MVT VT) {
  switch (VT.SimpleTy) {
  default:
    return false;
  case MVT::v2i1:
  case MVT::v4i1:
  case MVT::v2i8:
  case MVT::v4i8:
  case MVT::v2i16:
  case MVT::v4i16:
  case MVT::v8i16:  // <4 x i16x2>
  case MVT::v2i32:
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v4f16:
  case MVT::v8f16:  // <4 x f16x2>
  case MVT::v2bf16:
  case MVT::v4bf16:
  case MVT::v8bf16: // <4 x bf16x2>
  case MVT::v2f32:
  case MVT::v4f32:
  case MVT::v2f64:
    return true;
  }
}

static bool Is16bitsType(MVT VT) {
  return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 ||
          VT.SimpleTy == MVT::i16);
}

/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
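/// For example (illustrative), a struct type { i128, <4 x half> } is reported
/// as the flattened list { i64, i64, v2f16, v2f16 }: the i128 is split into
/// two i64 halves and the <4 x half> into two packed v2f16 pieces.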
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                               uint64_t StartingOffset = 0) {
  SmallVector<EVT, 16> TempVTs;
  SmallVector<uint64_t, 16> TempOffsets;

  // Special case for i128 - decompose to (i64, i64)
  if (Ty->isIntegerTy(128)) {
    ValueVTs.push_back(EVT(MVT::i64));
    ValueVTs.push_back(EVT(MVT::i64));

    if (Offsets) {
      Offsets->push_back(StartingOffset + 0);
      Offsets->push_back(StartingOffset + 8);
    }

    return;
  }

  // Given a struct type, recursively traverse its elements with custom
  // ComputePTXValueVTs.
  if (StructType *STy = dyn_cast<StructType>(Ty)) {
    auto const *SL = DL.getStructLayout(STy);
    auto ElementNum = 0;
    for (auto *EI : STy->elements()) {
      ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
                         StartingOffset + SL->getElementOffset(ElementNum));
      ++ElementNum;
    }
    return;
  }

  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
    EVT VT = TempVTs[i];
    uint64_t Off = TempOffsets[i];
    // Split vectors into individual elements, except for packed types such as
    // v2f16, which we will pass as a single scalar.
    if (VT.isVector()) {
      unsigned NumElts = VT.getVectorNumElements();
      EVT EltVT = VT.getVectorElementType();
      // Vectors with an even number of f16/bf16/i16 elements will be passed to
      // us as an array of v2f16/v2bf16/v2i16 elements. We must match this so we
      // stay in sync with Ins/Outs.
      if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) {
        switch (EltVT.getSimpleVT().SimpleTy) {
        case MVT::f16:
          EltVT = MVT::v2f16;
          break;
        case MVT::bf16:
          EltVT = MVT::v2bf16;
          break;
        case MVT::i16:
          EltVT = MVT::v2i16;
          break;
        default:
          llvm_unreachable("Unexpected type");
        }
        NumElts /= 2;
      } else if (EltVT.getSimpleVT() == MVT::i8 &&
                 (NumElts % 4 == 0 || NumElts == 3)) {
        // v*i8 are formally lowered as v4i8
        EltVT = MVT::v4i8;
        NumElts = (NumElts + 3) / 4;
      }
      for (unsigned j = 0; j != NumElts; ++j) {
        ValueVTs.push_back(EltVT);
        if (Offsets)
          Offsets->push_back(Off + j * EltVT.getStoreSize());
      }
    } else {
      ValueVTs.push_back(VT);
      if (Offsets)
        Offsets->push_back(Off);
    }
  }
}

/// PromoteScalarIntegerPTX
/// Used to make sure the arguments/returns are suitable for passing
/// and promote them to a larger size if they're not.
///
/// The promoted type is placed in \p PromotedVT if the function returns true.
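/// For example, an i17 value is widened to i32, while an i32 is left
/// unchanged and the function returns false.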
static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
  if (VT.isScalarInteger()) {
    switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
    default:
      llvm_unreachable(
          "Promotion is not suitable for scalars of size larger than 64-bits");
    case 1:
      *PromotedVT = MVT::i1;
      break;
    case 2:
    case 4:
    case 8:
      *PromotedVT = MVT::i8;
      break;
    case 16:
      *PromotedVT = MVT::i16;
      break;
    case 32:
      *PromotedVT = MVT::i32;
      break;
    case 64:
      *PromotedVT = MVT::i64;
      break;
    }
    return EVT(*PromotedVT) != VT;
  }
  return false;
}

// Check whether we can merge loads/stores of some of the pieces of a
// flattened function parameter or return value into a single vector
// load/store.
//
// The flattened parameter is represented as a list of EVTs and
// offsets, and the whole structure is aligned to ParamAlignment. This
// function determines whether we can load/store pieces of the
// parameter starting at index Idx using a single vectorized op of
// size AccessSize. If so, it returns the number of param pieces
// covered by the vector op. Otherwise, it returns 1.
static unsigned CanMergeParamLoadStoresStartingAt(
    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
    const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {

  // Can't vectorize if param alignment is not sufficient.
  if (ParamAlignment < AccessSize)
    return 1;
  // Can't vectorize if offset is not aligned.
  if (Offsets[Idx] & (AccessSize - 1))
    return 1;

  EVT EltVT = ValueVTs[Idx];
  unsigned EltSize = EltVT.getStoreSize();

  // Element is too large to vectorize.
  if (EltSize >= AccessSize)
    return 1;

  unsigned NumElts = AccessSize / EltSize;
  // Can't vectorize if AccessSize is not a multiple of EltSize.
  if (AccessSize != EltSize * NumElts)
    return 1;

  // We don't have enough elements to vectorize.
  if (Idx + NumElts > ValueVTs.size())
    return 1;

  // PTX ISA can only deal with 2- and 4-element vector ops.
  if (NumElts != 4 && NumElts != 2)
    return 1;

  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
    // Types do not match.
    if (ValueVTs[j] != EltVT)
      return 1;

    // Elements are not contiguous.
    if (Offsets[j] - Offsets[j - 1] != EltSize)
      return 1;
  }
  // OK. We can vectorize ValueVTs[Idx..Idx+NumElts)
  return NumElts;
}

// Flags for tracking per-element vectorization state of loads/stores
// of a flattened function parameter or return value.
enum ParamVectorizationFlags {
  PVF_INNER = 0x0,  // Middle elements of a vector.
  PVF_FIRST = 0x1,  // First element of the vector.
  PVF_LAST = 0x2,   // Last element of the vector.
  // Scalar is effectively a 1-element vector.
  PVF_SCALAR = PVF_FIRST | PVF_LAST
};

// Computes whether and how we can vectorize the loads/stores of a
// flattened function parameter or return value.
//
// The flattened parameter is represented as the list of ValueVTs and
// Offsets, and is aligned to ParamAlignment bytes. We return a vector
// of the same size as ValueVTs indicating how each piece should be
// loaded/stored (i.e. as a scalar, or as part of a vector
// load/store).
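// For example, four f32 pieces at offsets 0/4/8/12 within a 16-byte-aligned
// parameter are marked {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST} and can
// then be emitted as a single 4-element access; with only 8-byte alignment
// they would instead be split into two 2-element accesses.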
static SmallVector<ParamVectorizationFlags, 16>
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
                     const SmallVectorImpl<uint64_t> &Offsets,
                     Align ParamAlignment, bool IsVAArg = false) {
  // Set vector size to match ValueVTs and mark all elements as
  // scalars by default.
  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);

  if (IsVAArg)
    return VectorInfo;

  // Check what we can vectorize using 128/64/32-bit accesses.
  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
    // Skip elements we've already processed.
    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
    for (unsigned AccessSize : {16, 8, 4, 2}) {
      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
      // Mark vectorized elements.
      switch (NumElts) {
      default:
        llvm_unreachable("Unexpected return value");
      case 1:
        // Can't vectorize using this size, try next smaller size.
        continue;
      case 2:
        assert(I + 1 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_LAST;
        I += 1;
        break;
      case 4:
        assert(I + 3 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_INNER;
        VectorInfo[I + 2] = PVF_INNER;
        VectorInfo[I + 3] = PVF_LAST;
        I += 3;
        break;
      }
      // Break out of the inner loop because we've already succeeded
      // using the largest possible AccessSize.
      break;
    }
  }
  return VectorInfo;
}

// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                         const NVPTXSubtarget &STI)
    : TargetLowering(TM), nvTM(&TM), STI(STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned)0xFFFFFFFF;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned)0xFFFFFFFF;

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  // condition branches.
  setJumpIsExpensive(true);

  // Wide divides are _very_ slow. Try to reduce the width of the divide if
  // possible.
  addBypassSlowDiv(64, 32);

  // By default, use Source scheduling.
  if (sched4reg)
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Source);

  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                    LegalizeAction NoF16Action) {
    setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
  };

  auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                    LegalizeAction NoBF16Action) {
    bool IsOpSupported = STI.hasBF16Math();
    // A few instructions are only available on sm_90 and later.
    switch (Op) {
    case ISD::FADD:
    case ISD::FMUL:
    case ISD::FSUB:
    case ISD::SELECT:
    case ISD::SELECT_CC:
    case ISD::SETCC:
    case ISD::FEXP2:
    case ISD::FCEIL:
    case ISD::FFLOOR:
    case ISD::FNEARBYINT:
    case ISD::FRINT:
    case ISD::FTRUNC:
      IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;
      break;
    }
    setOperationAction(Op, VT, IsOpSupported ? Action : NoBF16Action);
  };

  auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                     LegalizeAction NoI16x2Action) {
    bool IsOpSupported = false;
    // These instructions are only available on sm_90 and later.
    switch (Op) {
    case ISD::ADD:
    case ISD::SMAX:
    case ISD::SMIN:
    case ISD::UMIN:
    case ISD::UMAX:
    case ISD::SUB:
      IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
      break;
    }
    setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
  };

  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
  addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);

  // Conversion to/from FP16/FP16x2 is always legal.
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);

  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);

  // Conversion to/from BF16/BF16x2 is always legal.
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2bf16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2bf16, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2bf16, Expand);

  setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
  setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
  if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
    AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);

  // Conversion to/from i16/i16x2 is always legal.
507 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom); 508 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); 509 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand); 510 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand); 511 512 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom); 513 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom); 514 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); 515 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom); 516 // Only logical ops can be done on v4i8 directly, others must be done 517 // elementwise. 518 setOperationAction( 519 {ISD::ABS, ISD::ADD, ISD::ADDC, ISD::ADDE, 520 ISD::BITREVERSE, ISD::CTLZ, ISD::CTPOP, ISD::CTTZ, 521 ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FSHL, ISD::FSHR, 522 ISD::MUL, ISD::MULHS, ISD::MULHU, ISD::PARITY, 523 ISD::ROTL, ISD::ROTR, ISD::SADDO, ISD::SADDO_CARRY, 524 ISD::SADDSAT, ISD::SDIV, ISD::SDIVREM, ISD::SELECT_CC, 525 ISD::SETCC, ISD::SHL, ISD::SINT_TO_FP, ISD::SMAX, 526 ISD::SMIN, ISD::SMULO, ISD::SMUL_LOHI, ISD::SRA, 527 ISD::SREM, ISD::SRL, ISD::SSHLSAT, ISD::SSUBO, 528 ISD::SSUBO_CARRY, ISD::SSUBSAT, ISD::SUB, ISD::SUBC, 529 ISD::SUBE, ISD::UADDO, ISD::UADDO_CARRY, ISD::UADDSAT, 530 ISD::UDIV, ISD::UDIVREM, ISD::UINT_TO_FP, ISD::UMAX, 531 ISD::UMIN, ISD::UMULO, ISD::UMUL_LOHI, ISD::UREM, 532 ISD::USHLSAT, ISD::USUBO, ISD::USUBO_CARRY, ISD::VSELECT, 533 ISD::USUBSAT}, 534 MVT::v4i8, Expand); 535 536 // Operations not directly supported by NVPTX. 537 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, 538 MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8, 539 MVT::i32, MVT::i64}) { 540 setOperationAction(ISD::SELECT_CC, VT, Expand); 541 setOperationAction(ISD::BR_CC, VT, Expand); 542 } 543 544 // Some SIGN_EXTEND_INREG can be done using cvt instruction. 545 // For others we will expand to a SHL/SRA pair. 546 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal); 547 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 548 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); 549 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 550 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 551 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); 552 553 setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom); 554 setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom); 555 setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom); 556 setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom); 557 setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom); 558 setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom); 559 560 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 561 setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); 562 563 // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs 564 // that don't have h/w rotation we lower them to multi-instruction assembly. 
  // See ROT*_sw in NVPTXInstrInfo.td
  setOperationAction(ISD::ROTL, MVT::i64, Legal);
  setOperationAction(ISD::ROTR, MVT::i64, Legal);
  setOperationAction(ISD::ROTL, MVT::i32, Legal);
  setOperationAction(ISD::ROTR, MVT::i32, Legal);

  setOperationAction(ISD::ROTL, MVT::i16, Expand);
  setOperationAction(ISD::ROTL, MVT::v2i16, Expand);
  setOperationAction(ISD::ROTR, MVT::i16, Expand);
  setOperationAction(ISD::ROTR, MVT::v2i16, Expand);
  setOperationAction(ISD::ROTL, MVT::i8, Expand);
  setOperationAction(ISD::ROTR, MVT::i8, Expand);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  setOperationAction(ISD::BSWAP, MVT::v2i16, Expand);
  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  setOperationAction(ISD::BSWAP, MVT::i64, Expand);

  // Indirect branch is not supported.
  // This also disables Jump Table creation.
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  // We want to legalize constant related memmove and memcpy
  // intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  // Turn FP extload into load/fpextend
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
  // Turn FP truncstore into trunc + store.
  // FIXME: vector types should also be expanded
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PTX does not support load / store predicate registers
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setTruncStoreAction(VT, MVT::i1, Expand);
  }

  // Expand extload of vectors of integers.
633 setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16, 634 MVT::v2i8, Expand); 635 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand); 636 637 // This is legal in NVPTX 638 setOperationAction(ISD::ConstantFP, MVT::f64, Legal); 639 setOperationAction(ISD::ConstantFP, MVT::f32, Legal); 640 setOperationAction(ISD::ConstantFP, MVT::f16, Legal); 641 setOperationAction(ISD::ConstantFP, MVT::bf16, Legal); 642 643 // Lowering of DYNAMIC_STACKALLOC is unsupported. 644 // Custom lower to produce an error. 645 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 646 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); 647 648 // TRAP can be lowered to PTX trap 649 setOperationAction(ISD::TRAP, MVT::Other, Legal); 650 651 // Register custom handling for vector loads/stores 652 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 653 if (IsPTXVectorType(VT)) { 654 setOperationAction(ISD::LOAD, VT, Custom); 655 setOperationAction(ISD::STORE, VT, Custom); 656 setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom); 657 } 658 } 659 660 // Support varargs. 661 setOperationAction(ISD::VASTART, MVT::Other, Custom); 662 setOperationAction(ISD::VAARG, MVT::Other, Custom); 663 setOperationAction(ISD::VACOPY, MVT::Other, Expand); 664 setOperationAction(ISD::VAEND, MVT::Other, Expand); 665 666 // Custom handling for i8 intrinsics 667 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); 668 669 for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) { 670 setOperationAction(ISD::ABS, Ty, Legal); 671 setOperationAction(ISD::SMIN, Ty, Legal); 672 setOperationAction(ISD::SMAX, Ty, Legal); 673 setOperationAction(ISD::UMIN, Ty, Legal); 674 setOperationAction(ISD::UMAX, Ty, Legal); 675 676 setOperationAction(ISD::CTPOP, Ty, Legal); 677 setOperationAction(ISD::CTLZ, Ty, Legal); 678 } 679 680 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom); 681 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom); 682 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom); 683 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom); 684 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom); 685 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand); 686 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand); 687 688 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom); 689 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom); 690 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom); 691 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom); 692 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom); 693 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom); 694 695 // Other arithmetic and logic ops are unsupported. 
  setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,
                      ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
                      ISD::SINT_TO_FP, ISD::UINT_TO_FP},
                     MVT::v2i16, Expand);

  setOperationAction(ISD::ADDC, MVT::i32, Legal);
  setOperationAction(ISD::ADDE, MVT::i32, Legal);
  setOperationAction(ISD::SUBC, MVT::i32, Legal);
  setOperationAction(ISD::SUBE, MVT::i32, Legal);
  if (STI.getPTXVersion() >= 43) {
    setOperationAction(ISD::ADDC, MVT::i64, Legal);
    setOperationAction(ISD::ADDE, MVT::i64, Legal);
    setOperationAction(ISD::SUBC, MVT::i64, Legal);
    setOperationAction(ISD::SUBE, MVT::i64, Legal);
  }

  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i64, Expand);

  // PTX does not directly support SELP of i1, so promote to i32 first
  setOperationAction(ISD::SELECT, MVT::i1, Custom);

  // PTX cannot multiply two i64s in a single instruction.
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);

  // We have some custom DAG combine patterns for these nodes
  setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
                       ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
                       ISD::VSELECT});

  // setcc for f16x2 and bf16x2 needs special handling to prevent
  // legalizer's attempt to scalarize it due to v2i1 not being legal.
  if (STI.allowFP16Math() || STI.hasBF16Math())
    setTargetDAGCombine(ISD::SETCC);

  // Promote fp16 arithmetic if fp16 hardware isn't available or the
  // user passed --nvptx-no-fp16-math. The flag is useful because,
  // although sm_53+ GPUs have some sort of FP16 support in
  // hardware, only sm_53 and sm_60 have a full implementation. Others
  // have only a token amount of hardware and are likely to run faster
  // by using the fp32 units instead.
  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
    setFP16OperationAction(Op, MVT::f16, Legal, Promote);
    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
    // bf16 must be promoted to f32.
    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
    if (getOperationAction(Op, MVT::bf16) == Promote)
      AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }

  // f16/f16x2 neg was introduced in PTX 60, SM_53.
  const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
                                        STI.getPTXVersion() >= 60 &&
                                        STI.allowFP16Math();
  for (const auto &VT : {MVT::f16, MVT::v2f16})
    setOperationAction(ISD::FNEG, VT,
                       IsFP16FP16x2NegAvailable ? Legal : Expand);

  setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
  setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
  // (would be) Library functions.

  // These map to conversion instructions for scalar FP types.
763 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT, 764 ISD::FROUNDEVEN, ISD::FTRUNC}) { 765 setOperationAction(Op, MVT::f16, Legal); 766 setOperationAction(Op, MVT::f32, Legal); 767 setOperationAction(Op, MVT::f64, Legal); 768 setOperationAction(Op, MVT::v2f16, Expand); 769 setOperationAction(Op, MVT::v2bf16, Expand); 770 setBF16OperationAction(Op, MVT::bf16, Legal, Promote); 771 if (getOperationAction(Op, MVT::bf16) == Promote) 772 AddPromotedToType(Op, MVT::bf16, MVT::f32); 773 } 774 775 // sm_80 only has conversions between f32 and bf16. Custom lower all other 776 // bf16 conversions. 777 if (STI.hasBF16Math() && 778 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) { 779 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) { 780 setOperationAction( 781 {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, 782 VT, Custom); 783 } 784 } 785 786 setOperationAction(ISD::FROUND, MVT::f16, Promote); 787 setOperationAction(ISD::FROUND, MVT::v2f16, Expand); 788 setOperationAction(ISD::FROUND, MVT::v2bf16, Expand); 789 setOperationAction(ISD::FROUND, MVT::f32, Custom); 790 setOperationAction(ISD::FROUND, MVT::f64, Custom); 791 setOperationAction(ISD::FROUND, MVT::bf16, Promote); 792 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32); 793 794 // 'Expand' implements FCOPYSIGN without calling an external library. 795 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); 796 setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand); 797 setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand); 798 setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand); 799 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 800 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 801 802 // These map to corresponding instructions for f32/f64. f16 must be 803 // promoted to f32. v2f16 is expanded to f16, which is then promoted 804 // to f32. 805 for (const auto &Op : 806 {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) { 807 setOperationAction(Op, MVT::f16, Promote); 808 setOperationAction(Op, MVT::f32, Legal); 809 setOperationAction(Op, MVT::f64, Legal); 810 setOperationAction(Op, MVT::v2f16, Expand); 811 setOperationAction(Op, MVT::v2bf16, Expand); 812 setOperationAction(Op, MVT::bf16, Promote); 813 AddPromotedToType(Op, MVT::bf16, MVT::f32); 814 } 815 for (const auto &Op : {ISD::FABS}) { 816 setOperationAction(Op, MVT::f16, Promote); 817 setOperationAction(Op, MVT::f32, Legal); 818 setOperationAction(Op, MVT::f64, Legal); 819 setOperationAction(Op, MVT::v2f16, Expand); 820 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); 821 setBF16OperationAction(Op, MVT::bf16, Legal, Promote); 822 if (getOperationAction(Op, MVT::bf16) == Promote) 823 AddPromotedToType(Op, MVT::bf16, MVT::f32); 824 } 825 826 // max.f16, max.f16x2 and max.NaN are supported on sm_80+. 827 auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) { 828 bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70; 829 return IsAtLeastSm80 ? 
Legal : NotSm80Action; 830 }; 831 for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) { 832 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote); 833 setOperationAction(Op, MVT::f32, Legal); 834 setOperationAction(Op, MVT::f64, Legal); 835 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand); 836 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); 837 setBF16OperationAction(Op, MVT::bf16, Legal, Promote); 838 if (getOperationAction(Op, MVT::bf16) == Promote) 839 AddPromotedToType(Op, MVT::bf16, MVT::f32); 840 } 841 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) { 842 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand); 843 setFP16OperationAction(Op, MVT::bf16, Legal, Expand); 844 setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand)); 845 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand); 846 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); 847 } 848 849 // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate. 850 // No FPOW or FREM in PTX. 851 852 // Now deduce the information based on the above mentioned 853 // actions 854 computeRegisterProperties(STI.getRegisterInfo()); 855 856 setMinCmpXchgSizeInBits(32); 857 setMaxAtomicSizeInBitsSupported(64); 858 } 859 860 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { 861 switch ((NVPTXISD::NodeType)Opcode) { 862 case NVPTXISD::FIRST_NUMBER: 863 break; 864 case NVPTXISD::CALL: 865 return "NVPTXISD::CALL"; 866 case NVPTXISD::RET_GLUE: 867 return "NVPTXISD::RET_GLUE"; 868 case NVPTXISD::LOAD_PARAM: 869 return "NVPTXISD::LOAD_PARAM"; 870 case NVPTXISD::Wrapper: 871 return "NVPTXISD::Wrapper"; 872 case NVPTXISD::DeclareParam: 873 return "NVPTXISD::DeclareParam"; 874 case NVPTXISD::DeclareScalarParam: 875 return "NVPTXISD::DeclareScalarParam"; 876 case NVPTXISD::DeclareRet: 877 return "NVPTXISD::DeclareRet"; 878 case NVPTXISD::DeclareScalarRet: 879 return "NVPTXISD::DeclareScalarRet"; 880 case NVPTXISD::DeclareRetParam: 881 return "NVPTXISD::DeclareRetParam"; 882 case NVPTXISD::PrintCall: 883 return "NVPTXISD::PrintCall"; 884 case NVPTXISD::PrintConvergentCall: 885 return "NVPTXISD::PrintConvergentCall"; 886 case NVPTXISD::PrintCallUni: 887 return "NVPTXISD::PrintCallUni"; 888 case NVPTXISD::PrintConvergentCallUni: 889 return "NVPTXISD::PrintConvergentCallUni"; 890 case NVPTXISD::LoadParam: 891 return "NVPTXISD::LoadParam"; 892 case NVPTXISD::LoadParamV2: 893 return "NVPTXISD::LoadParamV2"; 894 case NVPTXISD::LoadParamV4: 895 return "NVPTXISD::LoadParamV4"; 896 case NVPTXISD::StoreParam: 897 return "NVPTXISD::StoreParam"; 898 case NVPTXISD::StoreParamV2: 899 return "NVPTXISD::StoreParamV2"; 900 case NVPTXISD::StoreParamV4: 901 return "NVPTXISD::StoreParamV4"; 902 case NVPTXISD::StoreParamS32: 903 return "NVPTXISD::StoreParamS32"; 904 case NVPTXISD::StoreParamU32: 905 return "NVPTXISD::StoreParamU32"; 906 case NVPTXISD::CallArgBegin: 907 return "NVPTXISD::CallArgBegin"; 908 case NVPTXISD::CallArg: 909 return "NVPTXISD::CallArg"; 910 case NVPTXISD::LastCallArg: 911 return "NVPTXISD::LastCallArg"; 912 case NVPTXISD::CallArgEnd: 913 return "NVPTXISD::CallArgEnd"; 914 case NVPTXISD::CallVoid: 915 return "NVPTXISD::CallVoid"; 916 case NVPTXISD::CallVal: 917 return "NVPTXISD::CallVal"; 918 case NVPTXISD::CallSymbol: 919 return "NVPTXISD::CallSymbol"; 920 case NVPTXISD::Prototype: 921 return "NVPTXISD::Prototype"; 922 case NVPTXISD::MoveParam: 923 return "NVPTXISD::MoveParam"; 924 case NVPTXISD::StoreRetval: 925 
return "NVPTXISD::StoreRetval"; 926 case NVPTXISD::StoreRetvalV2: 927 return "NVPTXISD::StoreRetvalV2"; 928 case NVPTXISD::StoreRetvalV4: 929 return "NVPTXISD::StoreRetvalV4"; 930 case NVPTXISD::PseudoUseParam: 931 return "NVPTXISD::PseudoUseParam"; 932 case NVPTXISD::RETURN: 933 return "NVPTXISD::RETURN"; 934 case NVPTXISD::CallSeqBegin: 935 return "NVPTXISD::CallSeqBegin"; 936 case NVPTXISD::CallSeqEnd: 937 return "NVPTXISD::CallSeqEnd"; 938 case NVPTXISD::CallPrototype: 939 return "NVPTXISD::CallPrototype"; 940 case NVPTXISD::ProxyReg: 941 return "NVPTXISD::ProxyReg"; 942 case NVPTXISD::LoadV2: 943 return "NVPTXISD::LoadV2"; 944 case NVPTXISD::LoadV4: 945 return "NVPTXISD::LoadV4"; 946 case NVPTXISD::LDGV2: 947 return "NVPTXISD::LDGV2"; 948 case NVPTXISD::LDGV4: 949 return "NVPTXISD::LDGV4"; 950 case NVPTXISD::LDUV2: 951 return "NVPTXISD::LDUV2"; 952 case NVPTXISD::LDUV4: 953 return "NVPTXISD::LDUV4"; 954 case NVPTXISD::StoreV2: 955 return "NVPTXISD::StoreV2"; 956 case NVPTXISD::StoreV4: 957 return "NVPTXISD::StoreV4"; 958 case NVPTXISD::FUN_SHFL_CLAMP: 959 return "NVPTXISD::FUN_SHFL_CLAMP"; 960 case NVPTXISD::FUN_SHFR_CLAMP: 961 return "NVPTXISD::FUN_SHFR_CLAMP"; 962 case NVPTXISD::IMAD: 963 return "NVPTXISD::IMAD"; 964 case NVPTXISD::BFE: 965 return "NVPTXISD::BFE"; 966 case NVPTXISD::BFI: 967 return "NVPTXISD::BFI"; 968 case NVPTXISD::PRMT: 969 return "NVPTXISD::PRMT"; 970 case NVPTXISD::SETP_F16X2: 971 return "NVPTXISD::SETP_F16X2"; 972 case NVPTXISD::SETP_BF16X2: 973 return "NVPTXISD::SETP_BF16X2"; 974 case NVPTXISD::Dummy: 975 return "NVPTXISD::Dummy"; 976 case NVPTXISD::MUL_WIDE_SIGNED: 977 return "NVPTXISD::MUL_WIDE_SIGNED"; 978 case NVPTXISD::MUL_WIDE_UNSIGNED: 979 return "NVPTXISD::MUL_WIDE_UNSIGNED"; 980 case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32"; 981 case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat"; 982 case NVPTXISD::Tex1DFloatFloatLevel: 983 return "NVPTXISD::Tex1DFloatFloatLevel"; 984 case NVPTXISD::Tex1DFloatFloatGrad: 985 return "NVPTXISD::Tex1DFloatFloatGrad"; 986 case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32"; 987 case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float"; 988 case NVPTXISD::Tex1DS32FloatLevel: 989 return "NVPTXISD::Tex1DS32FloatLevel"; 990 case NVPTXISD::Tex1DS32FloatGrad: 991 return "NVPTXISD::Tex1DS32FloatGrad"; 992 case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32"; 993 case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float"; 994 case NVPTXISD::Tex1DU32FloatLevel: 995 return "NVPTXISD::Tex1DU32FloatLevel"; 996 case NVPTXISD::Tex1DU32FloatGrad: 997 return "NVPTXISD::Tex1DU32FloatGrad"; 998 case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32"; 999 case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat"; 1000 case NVPTXISD::Tex1DArrayFloatFloatLevel: 1001 return "NVPTXISD::Tex1DArrayFloatFloatLevel"; 1002 case NVPTXISD::Tex1DArrayFloatFloatGrad: 1003 return "NVPTXISD::Tex1DArrayFloatFloatGrad"; 1004 case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32"; 1005 case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float"; 1006 case NVPTXISD::Tex1DArrayS32FloatLevel: 1007 return "NVPTXISD::Tex1DArrayS32FloatLevel"; 1008 case NVPTXISD::Tex1DArrayS32FloatGrad: 1009 return "NVPTXISD::Tex1DArrayS32FloatGrad"; 1010 case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32"; 1011 case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float"; 1012 case NVPTXISD::Tex1DArrayU32FloatLevel: 1013 return 
"NVPTXISD::Tex1DArrayU32FloatLevel"; 1014 case NVPTXISD::Tex1DArrayU32FloatGrad: 1015 return "NVPTXISD::Tex1DArrayU32FloatGrad"; 1016 case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32"; 1017 case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat"; 1018 case NVPTXISD::Tex2DFloatFloatLevel: 1019 return "NVPTXISD::Tex2DFloatFloatLevel"; 1020 case NVPTXISD::Tex2DFloatFloatGrad: 1021 return "NVPTXISD::Tex2DFloatFloatGrad"; 1022 case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32"; 1023 case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float"; 1024 case NVPTXISD::Tex2DS32FloatLevel: 1025 return "NVPTXISD::Tex2DS32FloatLevel"; 1026 case NVPTXISD::Tex2DS32FloatGrad: 1027 return "NVPTXISD::Tex2DS32FloatGrad"; 1028 case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32"; 1029 case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float"; 1030 case NVPTXISD::Tex2DU32FloatLevel: 1031 return "NVPTXISD::Tex2DU32FloatLevel"; 1032 case NVPTXISD::Tex2DU32FloatGrad: 1033 return "NVPTXISD::Tex2DU32FloatGrad"; 1034 case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32"; 1035 case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; 1036 case NVPTXISD::Tex2DArrayFloatFloatLevel: 1037 return "NVPTXISD::Tex2DArrayFloatFloatLevel"; 1038 case NVPTXISD::Tex2DArrayFloatFloatGrad: 1039 return "NVPTXISD::Tex2DArrayFloatFloatGrad"; 1040 case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32"; 1041 case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float"; 1042 case NVPTXISD::Tex2DArrayS32FloatLevel: 1043 return "NVPTXISD::Tex2DArrayS32FloatLevel"; 1044 case NVPTXISD::Tex2DArrayS32FloatGrad: 1045 return "NVPTXISD::Tex2DArrayS32FloatGrad"; 1046 case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32"; 1047 case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float"; 1048 case NVPTXISD::Tex2DArrayU32FloatLevel: 1049 return "NVPTXISD::Tex2DArrayU32FloatLevel"; 1050 case NVPTXISD::Tex2DArrayU32FloatGrad: 1051 return "NVPTXISD::Tex2DArrayU32FloatGrad"; 1052 case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32"; 1053 case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat"; 1054 case NVPTXISD::Tex3DFloatFloatLevel: 1055 return "NVPTXISD::Tex3DFloatFloatLevel"; 1056 case NVPTXISD::Tex3DFloatFloatGrad: 1057 return "NVPTXISD::Tex3DFloatFloatGrad"; 1058 case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32"; 1059 case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float"; 1060 case NVPTXISD::Tex3DS32FloatLevel: 1061 return "NVPTXISD::Tex3DS32FloatLevel"; 1062 case NVPTXISD::Tex3DS32FloatGrad: 1063 return "NVPTXISD::Tex3DS32FloatGrad"; 1064 case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32"; 1065 case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float"; 1066 case NVPTXISD::Tex3DU32FloatLevel: 1067 return "NVPTXISD::Tex3DU32FloatLevel"; 1068 case NVPTXISD::Tex3DU32FloatGrad: 1069 return "NVPTXISD::Tex3DU32FloatGrad"; 1070 case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat"; 1071 case NVPTXISD::TexCubeFloatFloatLevel: 1072 return "NVPTXISD::TexCubeFloatFloatLevel"; 1073 case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float"; 1074 case NVPTXISD::TexCubeS32FloatLevel: 1075 return "NVPTXISD::TexCubeS32FloatLevel"; 1076 case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float"; 1077 case NVPTXISD::TexCubeU32FloatLevel: 1078 return "NVPTXISD::TexCubeU32FloatLevel"; 1079 case NVPTXISD::TexCubeArrayFloatFloat: 1080 return 
"NVPTXISD::TexCubeArrayFloatFloat"; 1081 case NVPTXISD::TexCubeArrayFloatFloatLevel: 1082 return "NVPTXISD::TexCubeArrayFloatFloatLevel"; 1083 case NVPTXISD::TexCubeArrayS32Float: 1084 return "NVPTXISD::TexCubeArrayS32Float"; 1085 case NVPTXISD::TexCubeArrayS32FloatLevel: 1086 return "NVPTXISD::TexCubeArrayS32FloatLevel"; 1087 case NVPTXISD::TexCubeArrayU32Float: 1088 return "NVPTXISD::TexCubeArrayU32Float"; 1089 case NVPTXISD::TexCubeArrayU32FloatLevel: 1090 return "NVPTXISD::TexCubeArrayU32FloatLevel"; 1091 case NVPTXISD::Tld4R2DFloatFloat: 1092 return "NVPTXISD::Tld4R2DFloatFloat"; 1093 case NVPTXISD::Tld4G2DFloatFloat: 1094 return "NVPTXISD::Tld4G2DFloatFloat"; 1095 case NVPTXISD::Tld4B2DFloatFloat: 1096 return "NVPTXISD::Tld4B2DFloatFloat"; 1097 case NVPTXISD::Tld4A2DFloatFloat: 1098 return "NVPTXISD::Tld4A2DFloatFloat"; 1099 case NVPTXISD::Tld4R2DS64Float: 1100 return "NVPTXISD::Tld4R2DS64Float"; 1101 case NVPTXISD::Tld4G2DS64Float: 1102 return "NVPTXISD::Tld4G2DS64Float"; 1103 case NVPTXISD::Tld4B2DS64Float: 1104 return "NVPTXISD::Tld4B2DS64Float"; 1105 case NVPTXISD::Tld4A2DS64Float: 1106 return "NVPTXISD::Tld4A2DS64Float"; 1107 case NVPTXISD::Tld4R2DU64Float: 1108 return "NVPTXISD::Tld4R2DU64Float"; 1109 case NVPTXISD::Tld4G2DU64Float: 1110 return "NVPTXISD::Tld4G2DU64Float"; 1111 case NVPTXISD::Tld4B2DU64Float: 1112 return "NVPTXISD::Tld4B2DU64Float"; 1113 case NVPTXISD::Tld4A2DU64Float: 1114 return "NVPTXISD::Tld4A2DU64Float"; 1115 1116 case NVPTXISD::TexUnified1DFloatS32: 1117 return "NVPTXISD::TexUnified1DFloatS32"; 1118 case NVPTXISD::TexUnified1DFloatFloat: 1119 return "NVPTXISD::TexUnified1DFloatFloat"; 1120 case NVPTXISD::TexUnified1DFloatFloatLevel: 1121 return "NVPTXISD::TexUnified1DFloatFloatLevel"; 1122 case NVPTXISD::TexUnified1DFloatFloatGrad: 1123 return "NVPTXISD::TexUnified1DFloatFloatGrad"; 1124 case NVPTXISD::TexUnified1DS32S32: 1125 return "NVPTXISD::TexUnified1DS32S32"; 1126 case NVPTXISD::TexUnified1DS32Float: 1127 return "NVPTXISD::TexUnified1DS32Float"; 1128 case NVPTXISD::TexUnified1DS32FloatLevel: 1129 return "NVPTXISD::TexUnified1DS32FloatLevel"; 1130 case NVPTXISD::TexUnified1DS32FloatGrad: 1131 return "NVPTXISD::TexUnified1DS32FloatGrad"; 1132 case NVPTXISD::TexUnified1DU32S32: 1133 return "NVPTXISD::TexUnified1DU32S32"; 1134 case NVPTXISD::TexUnified1DU32Float: 1135 return "NVPTXISD::TexUnified1DU32Float"; 1136 case NVPTXISD::TexUnified1DU32FloatLevel: 1137 return "NVPTXISD::TexUnified1DU32FloatLevel"; 1138 case NVPTXISD::TexUnified1DU32FloatGrad: 1139 return "NVPTXISD::TexUnified1DU32FloatGrad"; 1140 case NVPTXISD::TexUnified1DArrayFloatS32: 1141 return "NVPTXISD::TexUnified1DArrayFloatS32"; 1142 case NVPTXISD::TexUnified1DArrayFloatFloat: 1143 return "NVPTXISD::TexUnified1DArrayFloatFloat"; 1144 case NVPTXISD::TexUnified1DArrayFloatFloatLevel: 1145 return "NVPTXISD::TexUnified1DArrayFloatFloatLevel"; 1146 case NVPTXISD::TexUnified1DArrayFloatFloatGrad: 1147 return "NVPTXISD::TexUnified1DArrayFloatFloatGrad"; 1148 case NVPTXISD::TexUnified1DArrayS32S32: 1149 return "NVPTXISD::TexUnified1DArrayS32S32"; 1150 case NVPTXISD::TexUnified1DArrayS32Float: 1151 return "NVPTXISD::TexUnified1DArrayS32Float"; 1152 case NVPTXISD::TexUnified1DArrayS32FloatLevel: 1153 return "NVPTXISD::TexUnified1DArrayS32FloatLevel"; 1154 case NVPTXISD::TexUnified1DArrayS32FloatGrad: 1155 return "NVPTXISD::TexUnified1DArrayS32FloatGrad"; 1156 case NVPTXISD::TexUnified1DArrayU32S32: 1157 return "NVPTXISD::TexUnified1DArrayU32S32"; 1158 case NVPTXISD::TexUnified1DArrayU32Float: 1159 
return "NVPTXISD::TexUnified1DArrayU32Float"; 1160 case NVPTXISD::TexUnified1DArrayU32FloatLevel: 1161 return "NVPTXISD::TexUnified1DArrayU32FloatLevel"; 1162 case NVPTXISD::TexUnified1DArrayU32FloatGrad: 1163 return "NVPTXISD::TexUnified1DArrayU32FloatGrad"; 1164 case NVPTXISD::TexUnified2DFloatS32: 1165 return "NVPTXISD::TexUnified2DFloatS32"; 1166 case NVPTXISD::TexUnified2DFloatFloat: 1167 return "NVPTXISD::TexUnified2DFloatFloat"; 1168 case NVPTXISD::TexUnified2DFloatFloatLevel: 1169 return "NVPTXISD::TexUnified2DFloatFloatLevel"; 1170 case NVPTXISD::TexUnified2DFloatFloatGrad: 1171 return "NVPTXISD::TexUnified2DFloatFloatGrad"; 1172 case NVPTXISD::TexUnified2DS32S32: 1173 return "NVPTXISD::TexUnified2DS32S32"; 1174 case NVPTXISD::TexUnified2DS32Float: 1175 return "NVPTXISD::TexUnified2DS32Float"; 1176 case NVPTXISD::TexUnified2DS32FloatLevel: 1177 return "NVPTXISD::TexUnified2DS32FloatLevel"; 1178 case NVPTXISD::TexUnified2DS32FloatGrad: 1179 return "NVPTXISD::TexUnified2DS32FloatGrad"; 1180 case NVPTXISD::TexUnified2DU32S32: 1181 return "NVPTXISD::TexUnified2DU32S32"; 1182 case NVPTXISD::TexUnified2DU32Float: 1183 return "NVPTXISD::TexUnified2DU32Float"; 1184 case NVPTXISD::TexUnified2DU32FloatLevel: 1185 return "NVPTXISD::TexUnified2DU32FloatLevel"; 1186 case NVPTXISD::TexUnified2DU32FloatGrad: 1187 return "NVPTXISD::TexUnified2DU32FloatGrad"; 1188 case NVPTXISD::TexUnified2DArrayFloatS32: 1189 return "NVPTXISD::TexUnified2DArrayFloatS32"; 1190 case NVPTXISD::TexUnified2DArrayFloatFloat: 1191 return "NVPTXISD::TexUnified2DArrayFloatFloat"; 1192 case NVPTXISD::TexUnified2DArrayFloatFloatLevel: 1193 return "NVPTXISD::TexUnified2DArrayFloatFloatLevel"; 1194 case NVPTXISD::TexUnified2DArrayFloatFloatGrad: 1195 return "NVPTXISD::TexUnified2DArrayFloatFloatGrad"; 1196 case NVPTXISD::TexUnified2DArrayS32S32: 1197 return "NVPTXISD::TexUnified2DArrayS32S32"; 1198 case NVPTXISD::TexUnified2DArrayS32Float: 1199 return "NVPTXISD::TexUnified2DArrayS32Float"; 1200 case NVPTXISD::TexUnified2DArrayS32FloatLevel: 1201 return "NVPTXISD::TexUnified2DArrayS32FloatLevel"; 1202 case NVPTXISD::TexUnified2DArrayS32FloatGrad: 1203 return "NVPTXISD::TexUnified2DArrayS32FloatGrad"; 1204 case NVPTXISD::TexUnified2DArrayU32S32: 1205 return "NVPTXISD::TexUnified2DArrayU32S32"; 1206 case NVPTXISD::TexUnified2DArrayU32Float: 1207 return "NVPTXISD::TexUnified2DArrayU32Float"; 1208 case NVPTXISD::TexUnified2DArrayU32FloatLevel: 1209 return "NVPTXISD::TexUnified2DArrayU32FloatLevel"; 1210 case NVPTXISD::TexUnified2DArrayU32FloatGrad: 1211 return "NVPTXISD::TexUnified2DArrayU32FloatGrad"; 1212 case NVPTXISD::TexUnified3DFloatS32: 1213 return "NVPTXISD::TexUnified3DFloatS32"; 1214 case NVPTXISD::TexUnified3DFloatFloat: 1215 return "NVPTXISD::TexUnified3DFloatFloat"; 1216 case NVPTXISD::TexUnified3DFloatFloatLevel: 1217 return "NVPTXISD::TexUnified3DFloatFloatLevel"; 1218 case NVPTXISD::TexUnified3DFloatFloatGrad: 1219 return "NVPTXISD::TexUnified3DFloatFloatGrad"; 1220 case NVPTXISD::TexUnified3DS32S32: 1221 return "NVPTXISD::TexUnified3DS32S32"; 1222 case NVPTXISD::TexUnified3DS32Float: 1223 return "NVPTXISD::TexUnified3DS32Float"; 1224 case NVPTXISD::TexUnified3DS32FloatLevel: 1225 return "NVPTXISD::TexUnified3DS32FloatLevel"; 1226 case NVPTXISD::TexUnified3DS32FloatGrad: 1227 return "NVPTXISD::TexUnified3DS32FloatGrad"; 1228 case NVPTXISD::TexUnified3DU32S32: 1229 return "NVPTXISD::TexUnified3DU32S32"; 1230 case NVPTXISD::TexUnified3DU32Float: 1231 return "NVPTXISD::TexUnified3DU32Float"; 1232 case 
NVPTXISD::TexUnified3DU32FloatLevel: 1233 return "NVPTXISD::TexUnified3DU32FloatLevel"; 1234 case NVPTXISD::TexUnified3DU32FloatGrad: 1235 return "NVPTXISD::TexUnified3DU32FloatGrad"; 1236 case NVPTXISD::TexUnifiedCubeFloatFloat: 1237 return "NVPTXISD::TexUnifiedCubeFloatFloat"; 1238 case NVPTXISD::TexUnifiedCubeFloatFloatLevel: 1239 return "NVPTXISD::TexUnifiedCubeFloatFloatLevel"; 1240 case NVPTXISD::TexUnifiedCubeS32Float: 1241 return "NVPTXISD::TexUnifiedCubeS32Float"; 1242 case NVPTXISD::TexUnifiedCubeS32FloatLevel: 1243 return "NVPTXISD::TexUnifiedCubeS32FloatLevel"; 1244 case NVPTXISD::TexUnifiedCubeU32Float: 1245 return "NVPTXISD::TexUnifiedCubeU32Float"; 1246 case NVPTXISD::TexUnifiedCubeU32FloatLevel: 1247 return "NVPTXISD::TexUnifiedCubeU32FloatLevel"; 1248 case NVPTXISD::TexUnifiedCubeArrayFloatFloat: 1249 return "NVPTXISD::TexUnifiedCubeArrayFloatFloat"; 1250 case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: 1251 return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel"; 1252 case NVPTXISD::TexUnifiedCubeArrayS32Float: 1253 return "NVPTXISD::TexUnifiedCubeArrayS32Float"; 1254 case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: 1255 return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel"; 1256 case NVPTXISD::TexUnifiedCubeArrayU32Float: 1257 return "NVPTXISD::TexUnifiedCubeArrayU32Float"; 1258 case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: 1259 return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel"; 1260 case NVPTXISD::Tld4UnifiedR2DFloatFloat: 1261 return "NVPTXISD::Tld4UnifiedR2DFloatFloat"; 1262 case NVPTXISD::Tld4UnifiedG2DFloatFloat: 1263 return "NVPTXISD::Tld4UnifiedG2DFloatFloat"; 1264 case NVPTXISD::Tld4UnifiedB2DFloatFloat: 1265 return "NVPTXISD::Tld4UnifiedB2DFloatFloat"; 1266 case NVPTXISD::Tld4UnifiedA2DFloatFloat: 1267 return "NVPTXISD::Tld4UnifiedA2DFloatFloat"; 1268 case NVPTXISD::Tld4UnifiedR2DS64Float: 1269 return "NVPTXISD::Tld4UnifiedR2DS64Float"; 1270 case NVPTXISD::Tld4UnifiedG2DS64Float: 1271 return "NVPTXISD::Tld4UnifiedG2DS64Float"; 1272 case NVPTXISD::Tld4UnifiedB2DS64Float: 1273 return "NVPTXISD::Tld4UnifiedB2DS64Float"; 1274 case NVPTXISD::Tld4UnifiedA2DS64Float: 1275 return "NVPTXISD::Tld4UnifiedA2DS64Float"; 1276 case NVPTXISD::Tld4UnifiedR2DU64Float: 1277 return "NVPTXISD::Tld4UnifiedR2DU64Float"; 1278 case NVPTXISD::Tld4UnifiedG2DU64Float: 1279 return "NVPTXISD::Tld4UnifiedG2DU64Float"; 1280 case NVPTXISD::Tld4UnifiedB2DU64Float: 1281 return "NVPTXISD::Tld4UnifiedB2DU64Float"; 1282 case NVPTXISD::Tld4UnifiedA2DU64Float: 1283 return "NVPTXISD::Tld4UnifiedA2DU64Float"; 1284 1285 case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp"; 1286 case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp"; 1287 case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp"; 1288 case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp"; 1289 case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp"; 1290 case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp"; 1291 case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp"; 1292 case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp"; 1293 case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp"; 1294 case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp"; 1295 case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp"; 1296 1297 case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp"; 1298 case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp"; 1299 case 
NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp"; 1300 case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp"; 1301 case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp"; 1302 case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp"; 1303 case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp"; 1304 case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp"; 1305 case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp"; 1306 case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp"; 1307 case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp"; 1308 1309 case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp"; 1310 case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp"; 1311 case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp"; 1312 case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp"; 1313 case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp"; 1314 case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp"; 1315 case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp"; 1316 case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp"; 1317 case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp"; 1318 case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp"; 1319 case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp"; 1320 1321 case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp"; 1322 case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp"; 1323 case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp"; 1324 case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp"; 1325 case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp"; 1326 case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp"; 1327 case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp"; 1328 case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp"; 1329 case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp"; 1330 case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp"; 1331 case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp"; 1332 1333 case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp"; 1334 case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp"; 1335 case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp"; 1336 case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp"; 1337 case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp"; 1338 case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp"; 1339 case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp"; 1340 case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp"; 1341 case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp"; 1342 case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp"; 1343 case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp"; 1344 1345 case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap"; 1346 case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap"; 1347 case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap"; 1348 case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap"; 1349 
case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap"; 1350 case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap"; 1351 case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap"; 1352 case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap"; 1353 case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap"; 1354 case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap"; 1355 case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap"; 1356 1357 case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap"; 1358 case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap"; 1359 case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap"; 1360 case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap"; 1361 case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap"; 1362 case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap"; 1363 case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap"; 1364 case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap"; 1365 case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap"; 1366 case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap"; 1367 case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap"; 1368 1369 case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap"; 1370 case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap"; 1371 case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap"; 1372 case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap"; 1373 case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap"; 1374 case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap"; 1375 case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap"; 1376 case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap"; 1377 case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap"; 1378 case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap"; 1379 case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap"; 1380 1381 case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap"; 1382 case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap"; 1383 case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap"; 1384 case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap"; 1385 case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap"; 1386 case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap"; 1387 case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap"; 1388 case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap"; 1389 case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap"; 1390 case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap"; 1391 case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap"; 1392 1393 case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap"; 1394 case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap"; 1395 case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap"; 1396 case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap"; 1397 case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap"; 1398 case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap"; 1399 case NVPTXISD::Suld3DV2I32Trap: return 
"NVPTXISD::Suld3DV2I32Trap"; 1400 case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap"; 1401 case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap"; 1402 case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap"; 1403 case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap"; 1404 1405 case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero"; 1406 case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero"; 1407 case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero"; 1408 case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero"; 1409 case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero"; 1410 case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero"; 1411 case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero"; 1412 case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero"; 1413 case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero"; 1414 case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero"; 1415 case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero"; 1416 1417 case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero"; 1418 case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero"; 1419 case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero"; 1420 case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero"; 1421 case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero"; 1422 case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero"; 1423 case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero"; 1424 case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero"; 1425 case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero"; 1426 case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero"; 1427 case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero"; 1428 1429 case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero"; 1430 case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero"; 1431 case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero"; 1432 case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero"; 1433 case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero"; 1434 case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero"; 1435 case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero"; 1436 case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero"; 1437 case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero"; 1438 case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero"; 1439 case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero"; 1440 1441 case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero"; 1442 case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero"; 1443 case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero"; 1444 case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero"; 1445 case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero"; 1446 case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero"; 1447 case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero"; 1448 case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero"; 1449 case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero"; 1450 case 
NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero"; 1451 case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero"; 1452 1453 case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero"; 1454 case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero"; 1455 case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero"; 1456 case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero"; 1457 case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero"; 1458 case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero"; 1459 case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero"; 1460 case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero"; 1461 case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero"; 1462 case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero"; 1463 case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero"; 1464 } 1465 return nullptr; 1466 } 1467 1468 TargetLoweringBase::LegalizeTypeAction 1469 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const { 1470 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && 1471 VT.getScalarType() == MVT::i1) 1472 return TypeSplitVector; 1473 if (Isv2x16VT(VT)) 1474 return TypeLegal; 1475 return TargetLoweringBase::getPreferredVectorAction(VT); 1476 } 1477 1478 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, 1479 int Enabled, int &ExtraSteps, 1480 bool &UseOneConst, 1481 bool Reciprocal) const { 1482 if (!(Enabled == ReciprocalEstimate::Enabled || 1483 (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) 1484 return SDValue(); 1485 1486 if (ExtraSteps == ReciprocalEstimate::Unspecified) 1487 ExtraSteps = 0; 1488 1489 SDLoc DL(Operand); 1490 EVT VT = Operand.getValueType(); 1491 bool Ftz = useF32FTZ(DAG.getMachineFunction()); 1492 1493 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) { 1494 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 1495 DAG.getConstant(IID, DL, MVT::i32), Operand); 1496 }; 1497 1498 // The sqrt and rsqrt refinement processes assume we always start out with an 1499 // approximation of the rsqrt. Therefore, if we're going to do any refinement 1500 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing 1501 // any refinement, we must return a regular sqrt. 1502 if (Reciprocal || ExtraSteps > 0) { 1503 if (VT == MVT::f32) 1504 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f 1505 : Intrinsic::nvvm_rsqrt_approx_f); 1506 else if (VT == MVT::f64) 1507 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d); 1508 else 1509 return SDValue(); 1510 } else { 1511 if (VT == MVT::f32) 1512 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f 1513 : Intrinsic::nvvm_sqrt_approx_f); 1514 else { 1515 // There's no sqrt.approx.f64 instruction, so we emit 1516 // reciprocal(rsqrt(x)). This is faster than 1517 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain 1518 // x * rsqrt(x).) 
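      // Illustrative sketch only (register names assumed): with the intrinsics
      // used below, the final PTX for an f64 sqrt approximation is expected to
      // be the pair
      //   rsqrt.approx.f64   %t, %x;   // t ~ 1/sqrt(x)
      //   rcp.approx.ftz.f64 %r, %t;   // r ~ 1/t  ~ sqrt(x)
      // rather than a direct sqrt.approx.f64, which does not exist.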
1519 return DAG.getNode( 1520 ISD::INTRINSIC_WO_CHAIN, DL, VT, 1521 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32), 1522 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d)); 1523 } 1524 } 1525 } 1526 1527 SDValue 1528 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 1529 SDLoc dl(Op); 1530 const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op); 1531 auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace()); 1532 Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT); 1533 return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op); 1534 } 1535 1536 static bool IsTypePassedAsArray(const Type *Ty) { 1537 return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) || 1538 Ty->isHalfTy() || Ty->isBFloatTy(); 1539 } 1540 1541 std::string NVPTXTargetLowering::getPrototype( 1542 const DataLayout &DL, Type *retTy, const ArgListTy &Args, 1543 const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment, 1544 std::optional<std::pair<unsigned, const APInt &>> VAInfo, 1545 const CallBase &CB, unsigned UniqueCallSite) const { 1546 auto PtrVT = getPointerTy(DL); 1547 1548 bool isABI = (STI.getSmVersion() >= 20); 1549 assert(isABI && "Non-ABI compilation is not supported"); 1550 if (!isABI) 1551 return ""; 1552 1553 std::string Prototype; 1554 raw_string_ostream O(Prototype); 1555 O << "prototype_" << UniqueCallSite << " : .callprototype "; 1556 1557 if (retTy->getTypeID() == Type::VoidTyID) { 1558 O << "()"; 1559 } else { 1560 O << "("; 1561 if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) && 1562 !IsTypePassedAsArray(retTy)) { 1563 unsigned size = 0; 1564 if (auto *ITy = dyn_cast<IntegerType>(retTy)) { 1565 size = ITy->getBitWidth(); 1566 } else { 1567 assert(retTy->isFloatingPointTy() && 1568 "Floating point type expected here"); 1569 size = retTy->getPrimitiveSizeInBits(); 1570 } 1571 // PTX ABI requires all scalar return values to be at least 32 1572 // bits in size. fp16 normally uses .b16 as its storage type in 1573 // PTX, so its size must be adjusted here, too. 1574 size = promoteScalarArgumentSize(size); 1575 1576 O << ".param .b" << size << " _"; 1577 } else if (isa<PointerType>(retTy)) { 1578 O << ".param .b" << PtrVT.getSizeInBits() << " _"; 1579 } else if (IsTypePassedAsArray(retTy)) { 1580 O << ".param .align " << (retAlignment ? retAlignment->value() : 0) 1581 << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]"; 1582 } else { 1583 llvm_unreachable("Unknown return type"); 1584 } 1585 O << ") "; 1586 } 1587 O << "_ ("; 1588 1589 bool first = true; 1590 1591 const Function *F = CB.getFunction(); 1592 unsigned NumArgs = VAInfo ? 
VAInfo->first : Args.size(); 1593 for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) { 1594 Type *Ty = Args[i].Ty; 1595 if (!first) { 1596 O << ", "; 1597 } 1598 first = false; 1599 1600 if (!Outs[OIdx].Flags.isByVal()) { 1601 if (IsTypePassedAsArray(Ty)) { 1602 unsigned ParamAlign = 0; 1603 const CallInst *CallI = cast<CallInst>(&CB); 1604 // +1 because index 0 is reserved for return type alignment 1605 if (!getAlign(*CallI, i + 1, ParamAlign)) 1606 ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value(); 1607 O << ".param .align " << ParamAlign << " .b8 "; 1608 O << "_"; 1609 O << "[" << DL.getTypeAllocSize(Ty) << "]"; 1610 // update the index for Outs 1611 SmallVector<EVT, 16> vtparts; 1612 ComputeValueVTs(*this, DL, Ty, vtparts); 1613 if (unsigned len = vtparts.size()) 1614 OIdx += len - 1; 1615 continue; 1616 } 1617 // i8 types in IR will be i16 types in SDAG 1618 assert((getValueType(DL, Ty) == Outs[OIdx].VT || 1619 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && 1620 "type mismatch between callee prototype and arguments"); 1621 // scalar type 1622 unsigned sz = 0; 1623 if (isa<IntegerType>(Ty)) { 1624 sz = cast<IntegerType>(Ty)->getBitWidth(); 1625 sz = promoteScalarArgumentSize(sz); 1626 } else if (isa<PointerType>(Ty)) { 1627 sz = PtrVT.getSizeInBits(); 1628 } else { 1629 sz = Ty->getPrimitiveSizeInBits(); 1630 } 1631 O << ".param .b" << sz << " "; 1632 O << "_"; 1633 continue; 1634 } 1635 1636 Type *ETy = Args[i].IndirectType; 1637 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); 1638 Align ParamByValAlign = 1639 getFunctionByValParamAlign(F, ETy, InitialAlign, DL); 1640 1641 O << ".param .align " << ParamByValAlign.value() << " .b8 "; 1642 O << "_"; 1643 O << "[" << Outs[OIdx].Flags.getByValSize() << "]"; 1644 } 1645 1646 if (VAInfo) 1647 O << (first ? "" : ",") << " .param .align " << VAInfo->second 1648 << " .b8 _[]\n"; 1649 O << ")"; 1650 if (shouldEmitPTXNoReturn(&CB, *nvTM)) 1651 O << " .noreturn"; 1652 O << ";"; 1653 1654 return Prototype; 1655 } 1656 1657 Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, 1658 const CallBase *CB, Type *Ty, 1659 unsigned Idx, 1660 const DataLayout &DL) const { 1661 if (!CB) { 1662 // CallSite is zero, fallback to ABI type alignment 1663 return DL.getABITypeAlign(Ty); 1664 } 1665 1666 unsigned Alignment = 0; 1667 const Function *DirectCallee = CB->getCalledFunction(); 1668 1669 if (!DirectCallee) { 1670 // We don't have a direct function symbol, but that may be because of 1671 // constant cast instructions in the call. 
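  // Purely illustrative (hypothetical IR, typed-pointer style): such a call
  // might look like
  //   %v = call i32 bitcast (void ()* @callee to i32 (i32)*)(i32 %x)
  // where the callee is only reachable through a constant bitcast expression.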
1672 1673 // With bitcast'd call targets, the instruction will be the call 1674 if (const auto *CI = dyn_cast<CallInst>(CB)) { 1675 // Check if we have call alignment metadata 1676 if (getAlign(*CI, Idx, Alignment)) 1677 return Align(Alignment); 1678 } 1679 DirectCallee = getMaybeBitcastedCallee(CB); 1680 } 1681 1682 // Check for function alignment information if we found that the 1683 // ultimate target is a Function 1684 if (DirectCallee) { 1685 if (getAlign(*DirectCallee, Idx, Alignment)) 1686 return Align(Alignment); 1687 // If alignment information is not available, fall back to the 1688 // default function param optimized type alignment 1689 return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL); 1690 } 1691 1692 // Call is indirect, fall back to the ABI type alignment 1693 return DL.getABITypeAlign(Ty); 1694 } 1695 1696 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 1697 SmallVectorImpl<SDValue> &InVals) const { 1698 1699 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30)) 1700 report_fatal_error( 1701 "Support for variadic functions (unsized array parameter) introduced " 1702 "in PTX ISA version 6.0 and requires target sm_30."); 1703 1704 SelectionDAG &DAG = CLI.DAG; 1705 SDLoc dl = CLI.DL; 1706 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1707 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1708 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1709 SDValue Chain = CLI.Chain; 1710 SDValue Callee = CLI.Callee; 1711 bool &isTailCall = CLI.IsTailCall; 1712 ArgListTy &Args = CLI.getArgs(); 1713 Type *RetTy = CLI.RetTy; 1714 const CallBase *CB = CLI.CB; 1715 const DataLayout &DL = DAG.getDataLayout(); 1716 1717 bool isABI = (STI.getSmVersion() >= 20); 1718 assert(isABI && "Non-ABI compilation is not supported"); 1719 if (!isABI) 1720 return Chain; 1721 1722 // Variadic arguments. 1723 // 1724 // Normally, for each argument, we declare a param scalar or a param 1725 // byte array in the .param space, and store the argument value to that 1726 // param scalar or array starting at offset 0. 1727 // 1728 // In the case of the first variadic argument, we declare a vararg byte array 1729 // with size 0. The exact size of this array isn't known at this point, so 1730 // it'll be patched later. All the variadic arguments will be stored to this 1731 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is 1732 // initially set to 0, so it can be used for non-variadic arguments (which use 1733 // 0 offset) to simplify the code. 1734 // 1735 // After all vararg is processed, 'VAOffset' holds the size of the 1736 // vararg byte array. 1737 1738 SDValue VADeclareParam; // vararg byte array 1739 unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic 1740 unsigned VAOffset = 0; // current offset in the param array 1741 1742 unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1); 1743 SDValue TempChain = Chain; 1744 Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl); 1745 SDValue InGlue = Chain.getValue(1); 1746 1747 unsigned ParamCount = 0; 1748 // Args.size() and Outs.size() need not match. 1749 // Outs.size() will be larger 1750 // * if there is an aggregate argument with multiple fields (each field 1751 // showing up separately in Outs) 1752 // * if there is a vector argument with more than typical vector-length 1753 // elements (generally if more than 4) where each vector element is 1754 // individually present in Outs. 
1755 // So a different index should be used for indexing into Outs/OutVals. 1756 // See similar issue in LowerFormalArguments. 1757 unsigned OIdx = 0; 1758 // Declare the .params or .reg need to pass values 1759 // to the function 1760 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { 1761 EVT VT = Outs[OIdx].VT; 1762 Type *Ty = Args[i].Ty; 1763 bool IsVAArg = (i >= CLI.NumFixedArgs); 1764 bool IsByVal = Outs[OIdx].Flags.isByVal(); 1765 1766 SmallVector<EVT, 16> VTs; 1767 SmallVector<uint64_t, 16> Offsets; 1768 1769 assert((!IsByVal || Args[i].IndirectType) && 1770 "byval arg must have indirect type"); 1771 Type *ETy = (IsByVal ? Args[i].IndirectType : Ty); 1772 ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset); 1773 1774 Align ArgAlign; 1775 if (IsByVal) { 1776 // The ByValAlign in the Outs[OIdx].Flags is always set at this point, 1777 // so we don't need to worry whether it's naturally aligned or not. 1778 // See TargetLowering::LowerCallTo(). 1779 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); 1780 ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy, 1781 InitialAlign, DL); 1782 if (IsVAArg) 1783 VAOffset = alignTo(VAOffset, ArgAlign); 1784 } else { 1785 ArgAlign = getArgumentAlignment(Callee, CB, Ty, ParamCount + 1, DL); 1786 } 1787 1788 unsigned TypeSize = 1789 (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty)); 1790 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1791 1792 bool NeedAlign; // Does argument declaration specify alignment? 1793 bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty); 1794 if (IsVAArg) { 1795 if (ParamCount == FirstVAArg) { 1796 SDValue DeclareParamOps[] = { 1797 Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32), 1798 DAG.getConstant(ParamCount, dl, MVT::i32), 1799 DAG.getConstant(1, dl, MVT::i32), InGlue}; 1800 VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, 1801 DeclareParamVTs, DeclareParamOps); 1802 } 1803 NeedAlign = PassAsArray; 1804 } else if (PassAsArray) { 1805 // declare .param .align <align> .b8 .param<n>[<size>]; 1806 SDValue DeclareParamOps[] = { 1807 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32), 1808 DAG.getConstant(ParamCount, dl, MVT::i32), 1809 DAG.getConstant(TypeSize, dl, MVT::i32), InGlue}; 1810 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, 1811 DeclareParamOps); 1812 NeedAlign = true; 1813 } else { 1814 // declare .param .b<size> .param<n>; 1815 if (VT.isInteger() || VT.isFloatingPoint()) { 1816 // PTX ABI requires integral types to be at least 32 bits in 1817 // size. FP16 is loaded/stored using i16, so it's handled 1818 // here as well. 1819 TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8; 1820 } 1821 SDValue DeclareScalarParamOps[] = { 1822 Chain, DAG.getConstant(ParamCount, dl, MVT::i32), 1823 DAG.getConstant(TypeSize * 8, dl, MVT::i32), 1824 DAG.getConstant(0, dl, MVT::i32), InGlue}; 1825 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, 1826 DeclareScalarParamOps); 1827 NeedAlign = false; 1828 } 1829 InGlue = Chain.getValue(1); 1830 1831 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter 1832 // than 32-bits are sign extended or zero extended, depending on 1833 // whether they are signed or unsigned types. This case applies 1834 // only to scalar parameters and not to aggregate values. 
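    // Illustrative example (assumed, not emitted verbatim): a scalar i8 or i16
    // argument is declared as a 32-bit param and widened before being stored,
    // e.g.
    //   .param .b32 param0;
    //   st.param.b32 [param0], %r;  // %r holds the sign-/zero-extended value
    // Aggregate and byval parameters are not widened this way.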
1835 bool ExtendIntegerParam = 1836 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; 1837 1838 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg); 1839 SmallVector<SDValue, 6> StoreOperands; 1840 for (unsigned j = 0, je = VTs.size(); j != je; ++j) { 1841 EVT EltVT = VTs[j]; 1842 int CurOffset = Offsets[j]; 1843 MaybeAlign PartAlign; 1844 if (NeedAlign) 1845 PartAlign = commonAlignment(ArgAlign, CurOffset); 1846 1847 // New store. 1848 if (VectorInfo[j] & PVF_FIRST) { 1849 assert(StoreOperands.empty() && "Unfinished preceding store."); 1850 StoreOperands.push_back(Chain); 1851 StoreOperands.push_back( 1852 DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32)); 1853 StoreOperands.push_back(DAG.getConstant( 1854 IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset), 1855 dl, MVT::i32)); 1856 } 1857 1858 SDValue StVal = OutVals[OIdx]; 1859 1860 MVT PromotedVT; 1861 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) { 1862 EltVT = EVT(PromotedVT); 1863 } 1864 if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) { 1865 llvm::ISD::NodeType Ext = 1866 Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 1867 StVal = DAG.getNode(Ext, dl, PromotedVT, StVal); 1868 } 1869 1870 if (IsByVal) { 1871 auto PtrVT = getPointerTy(DL); 1872 SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal, 1873 DAG.getConstant(CurOffset, dl, PtrVT)); 1874 StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(), 1875 PartAlign); 1876 } else if (ExtendIntegerParam) { 1877 assert(VTs.size() == 1 && "Scalar can't have multiple parts."); 1878 // zext/sext to i32 1879 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 1880 : ISD::ZERO_EXTEND, 1881 dl, MVT::i32, StVal); 1882 } 1883 1884 if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) { 1885 // Use 16-bit registers for small stores as it's the 1886 // smallest general purpose register size supported by NVPTX. 1887 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); 1888 } 1889 1890 // Record the value to store. 1891 StoreOperands.push_back(StVal); 1892 1893 if (VectorInfo[j] & PVF_LAST) { 1894 unsigned NumElts = StoreOperands.size() - 3; 1895 NVPTXISD::NodeType Op; 1896 switch (NumElts) { 1897 case 1: 1898 Op = NVPTXISD::StoreParam; 1899 break; 1900 case 2: 1901 Op = NVPTXISD::StoreParamV2; 1902 break; 1903 case 4: 1904 Op = NVPTXISD::StoreParamV4; 1905 break; 1906 default: 1907 llvm_unreachable("Invalid vector info."); 1908 } 1909 1910 StoreOperands.push_back(InGlue); 1911 1912 // Adjust type of the store op if we've extended the scalar 1913 // return value. 1914 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; 1915 1916 Chain = DAG.getMemIntrinsicNode( 1917 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, 1918 TheStoreType, MachinePointerInfo(), PartAlign, 1919 MachineMemOperand::MOStore); 1920 InGlue = Chain.getValue(1); 1921 1922 // Cleanup. 1923 StoreOperands.clear(); 1924 1925 // TODO: We may need to support vector types that can be passed 1926 // as scalars in variadic arguments. 
1927 if (!IsByVal && IsVAArg) { 1928 assert(NumElts == 1 && 1929 "Vectorization is expected to be disabled for variadics."); 1930 VAOffset += DL.getTypeAllocSize( 1931 TheStoreType.getTypeForEVT(*DAG.getContext())); 1932 } 1933 } 1934 if (!IsByVal) 1935 ++OIdx; 1936 } 1937 assert(StoreOperands.empty() && "Unfinished parameter store."); 1938 if (!IsByVal && VTs.size() > 0) 1939 --OIdx; 1940 ++ParamCount; 1941 if (IsByVal && IsVAArg) 1942 VAOffset += TypeSize; 1943 } 1944 1945 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); 1946 MaybeAlign retAlignment = std::nullopt; 1947 1948 // Handle Result 1949 if (Ins.size() > 0) { 1950 SmallVector<EVT, 16> resvtparts; 1951 ComputeValueVTs(*this, DL, RetTy, resvtparts); 1952 1953 // Declare 1954 // .param .align N .b8 retval0[<size-in-bytes>], or 1955 // .param .b<size-in-bits> retval0 1956 unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy); 1957 if (!IsTypePassedAsArray(RetTy)) { 1958 resultsz = promoteScalarArgumentSize(resultsz); 1959 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1960 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 1961 DAG.getConstant(resultsz, dl, MVT::i32), 1962 DAG.getConstant(0, dl, MVT::i32), InGlue }; 1963 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, 1964 DeclareRetOps); 1965 InGlue = Chain.getValue(1); 1966 } else { 1967 retAlignment = getArgumentAlignment(Callee, CB, RetTy, 0, DL); 1968 assert(retAlignment && "retAlignment is guaranteed to be set"); 1969 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1970 SDValue DeclareRetOps[] = { 1971 Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32), 1972 DAG.getConstant(resultsz / 8, dl, MVT::i32), 1973 DAG.getConstant(0, dl, MVT::i32), InGlue}; 1974 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, 1975 DeclareRetOps); 1976 InGlue = Chain.getValue(1); 1977 } 1978 } 1979 1980 bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs); 1981 // Set the size of the vararg param byte array if the callee is a variadic 1982 // function and the variadic part is not empty. 1983 if (HasVAArgs) { 1984 SDValue DeclareParamOps[] = { 1985 VADeclareParam.getOperand(0), VADeclareParam.getOperand(1), 1986 VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32), 1987 VADeclareParam.getOperand(4)}; 1988 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(), 1989 VADeclareParam->getVTList(), DeclareParamOps); 1990 } 1991 1992 // Both indirect calls and libcalls have nullptr Func. In order to distinguish 1993 // between them we must rely on the call site value which is valid for 1994 // indirect calls but is always null for libcalls. 1995 bool isIndirectCall = !Func && CB; 1996 1997 if (isa<ExternalSymbolSDNode>(Callee)) { 1998 Function* CalleeFunc = nullptr; 1999 2000 // Try to find the callee in the current module. 2001 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc); 2002 assert(CalleeFunc != nullptr && "Libcall callee must be set."); 2003 2004 // Set the "libcall callee" attribute to indicate that the function 2005 // must always have a declaration. 2006 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true"); 2007 } 2008 2009 if (isIndirectCall) { 2010 // This is indirect function call case : PTX requires a prototype of the 2011 // form 2012 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _); 2013 // to be emitted, and the label has to used as the last arg of call 2014 // instruction. 
2015 // The prototype is embedded in a string and put as the operand for a 2016 // CallPrototype SDNode which will print out to the value of the string. 2017 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2018 std::string Proto = getPrototype( 2019 DL, RetTy, Args, Outs, retAlignment, 2020 HasVAArgs 2021 ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair( 2022 CLI.NumFixedArgs, VADeclareParam->getConstantOperandAPInt(1))) 2023 : std::nullopt, 2024 *CB, UniqueCallSite); 2025 const char *ProtoStr = nvTM->getStrPool().save(Proto).data(); 2026 SDValue ProtoOps[] = { 2027 Chain, 2028 DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), 2029 InGlue, 2030 }; 2031 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps); 2032 InGlue = Chain.getValue(1); 2033 } 2034 // Op to just print "call" 2035 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2036 SDValue PrintCallOps[] = { 2037 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue 2038 }; 2039 // We model convergent calls as separate opcodes. 2040 unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni; 2041 if (CLI.IsConvergent) 2042 Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni 2043 : NVPTXISD::PrintConvergentCall; 2044 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps); 2045 InGlue = Chain.getValue(1); 2046 2047 // Ops to print out the function name 2048 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2049 SDValue CallVoidOps[] = { Chain, Callee, InGlue }; 2050 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps); 2051 InGlue = Chain.getValue(1); 2052 2053 // Ops to print out the param list 2054 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2055 SDValue CallArgBeginOps[] = { Chain, InGlue }; 2056 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs, 2057 CallArgBeginOps); 2058 InGlue = Chain.getValue(1); 2059 2060 for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e; 2061 ++i) { 2062 unsigned opcode; 2063 if (i == (e - 1)) 2064 opcode = NVPTXISD::LastCallArg; 2065 else 2066 opcode = NVPTXISD::CallArg; 2067 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2068 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 2069 DAG.getConstant(i, dl, MVT::i32), InGlue }; 2070 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps); 2071 InGlue = Chain.getValue(1); 2072 } 2073 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2074 SDValue CallArgEndOps[] = { Chain, 2075 DAG.getConstant(isIndirectCall ? 
0 : 1, dl, MVT::i32), 2076 InGlue }; 2077 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); 2078 InGlue = Chain.getValue(1); 2079 2080 if (isIndirectCall) { 2081 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2082 SDValue PrototypeOps[] = { 2083 Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue}; 2084 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps); 2085 InGlue = Chain.getValue(1); 2086 } 2087 2088 SmallVector<SDValue, 16> ProxyRegOps; 2089 SmallVector<std::optional<MVT>, 16> ProxyRegTruncates; 2090 2091 // Generate loads from param memory/moves from registers for result 2092 if (Ins.size() > 0) { 2093 SmallVector<EVT, 16> VTs; 2094 SmallVector<uint64_t, 16> Offsets; 2095 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); 2096 assert(VTs.size() == Ins.size() && "Bad value decomposition"); 2097 2098 Align RetAlign = getArgumentAlignment(Callee, CB, RetTy, 0, DL); 2099 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); 2100 2101 SmallVector<EVT, 6> LoadVTs; 2102 int VecIdx = -1; // Index of the first element of the vector. 2103 2104 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 2105 // 32-bits are sign extended or zero extended, depending on whether 2106 // they are signed or unsigned types. 2107 bool ExtendIntegerRetVal = 2108 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 2109 2110 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 2111 bool needTruncate = false; 2112 EVT TheLoadType = VTs[i]; 2113 EVT EltType = Ins[i].VT; 2114 Align EltAlign = commonAlignment(RetAlign, Offsets[i]); 2115 MVT PromotedVT; 2116 2117 if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) { 2118 TheLoadType = EVT(PromotedVT); 2119 EltType = EVT(PromotedVT); 2120 needTruncate = true; 2121 } 2122 2123 if (ExtendIntegerRetVal) { 2124 TheLoadType = MVT::i32; 2125 EltType = MVT::i32; 2126 needTruncate = true; 2127 } else if (TheLoadType.getSizeInBits() < 16) { 2128 if (VTs[i].isInteger()) 2129 needTruncate = true; 2130 EltType = MVT::i16; 2131 } 2132 2133 // Record index of the very first element of the vector. 
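      // Illustration (assumed layout): if the return value decomposes into
      // four 4-byte parts that VectorizePTXValueVTs can group into one chunk,
      // a single LoadParamV4 node (ld.param.v4) is emitted below instead of
      // four scalar ld.param operations.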
2134 if (VectorInfo[i] & PVF_FIRST) { 2135 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); 2136 VecIdx = i; 2137 } 2138 2139 LoadVTs.push_back(EltType); 2140 2141 if (VectorInfo[i] & PVF_LAST) { 2142 unsigned NumElts = LoadVTs.size(); 2143 LoadVTs.push_back(MVT::Other); 2144 LoadVTs.push_back(MVT::Glue); 2145 NVPTXISD::NodeType Op; 2146 switch (NumElts) { 2147 case 1: 2148 Op = NVPTXISD::LoadParam; 2149 break; 2150 case 2: 2151 Op = NVPTXISD::LoadParamV2; 2152 break; 2153 case 4: 2154 Op = NVPTXISD::LoadParamV4; 2155 break; 2156 default: 2157 llvm_unreachable("Invalid vector info."); 2158 } 2159 2160 SDValue LoadOperands[] = { 2161 Chain, DAG.getConstant(1, dl, MVT::i32), 2162 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue}; 2163 SDValue RetVal = DAG.getMemIntrinsicNode( 2164 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, 2165 MachinePointerInfo(), EltAlign, 2166 MachineMemOperand::MOLoad); 2167 2168 for (unsigned j = 0; j < NumElts; ++j) { 2169 ProxyRegOps.push_back(RetVal.getValue(j)); 2170 2171 if (needTruncate) 2172 ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT)); 2173 else 2174 ProxyRegTruncates.push_back(std::optional<MVT>()); 2175 } 2176 2177 Chain = RetVal.getValue(NumElts); 2178 InGlue = RetVal.getValue(NumElts + 1); 2179 2180 // Cleanup 2181 VecIdx = -1; 2182 LoadVTs.clear(); 2183 } 2184 } 2185 } 2186 2187 Chain = 2188 DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl); 2189 InGlue = Chain.getValue(1); 2190 2191 // Append ProxyReg instructions to the chain to make sure that `callseq_end` 2192 // will not get lost. Otherwise, during libcalls expansion, the nodes can become 2193 // dangling. 2194 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) { 2195 SDValue Ret = DAG.getNode( 2196 NVPTXISD::ProxyReg, dl, 2197 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue), 2198 { Chain, ProxyRegOps[i], InGlue } 2199 ); 2200 2201 Chain = Ret.getValue(1); 2202 InGlue = Ret.getValue(2); 2203 2204 if (ProxyRegTruncates[i]) { 2205 Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret); 2206 } 2207 2208 InVals.push_back(Ret); 2209 } 2210 2211 // set isTailCall to false for now, until we figure out how to express 2212 // tail call optimization in PTX 2213 isTailCall = false; 2214 return Chain; 2215 } 2216 2217 SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 2218 SelectionDAG &DAG) const { 2219 const Function &Fn = DAG.getMachineFunction().getFunction(); 2220 2221 DiagnosticInfoUnsupported NoDynamicAlloca( 2222 Fn, "dynamic alloca unsupported by NVPTX backend", 2223 SDLoc(Op).getDebugLoc()); 2224 DAG.getContext()->diagnose(NoDynamicAlloca); 2225 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)}; 2226 return DAG.getMergeValues(Ops, SDLoc()); 2227 } 2228 2229 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() 2230 // (see LegalizeDAG.cpp). This is slow and uses local memory. 
2231 // We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 2232 SDValue 2233 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 2234 SDNode *Node = Op.getNode(); 2235 SDLoc dl(Node); 2236 SmallVector<SDValue, 8> Ops; 2237 unsigned NumOperands = Node->getNumOperands(); 2238 for (unsigned i = 0; i < NumOperands; ++i) { 2239 SDValue SubOp = Node->getOperand(i); 2240 EVT VVT = SubOp.getNode()->getValueType(0); 2241 EVT EltVT = VVT.getVectorElementType(); 2242 unsigned NumSubElem = VVT.getVectorNumElements(); 2243 for (unsigned j = 0; j < NumSubElem; ++j) { 2244 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp, 2245 DAG.getIntPtrConstant(j, dl))); 2246 } 2247 } 2248 return DAG.getBuildVector(Node->getValueType(0), dl, Ops); 2249 } 2250 2251 // We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it 2252 // would get lowered as two constant loads and vector-packing move. 2253 // Instead we want just a constant move: 2254 // mov.b32 %r2, 0x40003C00 2255 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, 2256 SelectionDAG &DAG) const { 2257 EVT VT = Op->getValueType(0); 2258 if (!(Isv2x16VT(VT) || VT == MVT::v4i8)) 2259 return Op; 2260 2261 SDLoc DL(Op); 2262 2263 if (!llvm::all_of(Op->ops(), [](SDValue Operand) { 2264 return Operand->isUndef() || isa<ConstantSDNode>(Operand) || 2265 isa<ConstantFPSDNode>(Operand); 2266 })) { 2267 // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us 2268 // to optimize calculation of constant parts. 2269 if (VT == MVT::v4i8) { 2270 SDValue C8 = DAG.getConstant(8, DL, MVT::i32); 2271 SDValue E01 = DAG.getNode( 2272 NVPTXISD::BFI, DL, MVT::i32, 2273 DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32), 2274 DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8); 2275 SDValue E012 = 2276 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, 2277 DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), 2278 E01, DAG.getConstant(16, DL, MVT::i32), C8); 2279 SDValue E0123 = 2280 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, 2281 DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), 2282 E012, DAG.getConstant(24, DL, MVT::i32), C8); 2283 return DAG.getNode(ISD::BITCAST, DL, VT, E0123); 2284 } 2285 return Op; 2286 } 2287 2288 // Get value or the Nth operand as an APInt(32). Undef values treated as 0. 
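  // Worked example of the packing below (values chosen for illustration):
  //   <2 x half> <1.0, 2.0>   -> 0x3C00 | (0x4000 << 16) = 0x40003C00
  //   <4 x i8>   <1, 2, 3, 4> -> 0x01 | (0x02 << 8) | (0x03 << 16) | (0x04 << 24)
  //                            = 0x04030201
  // Element 0 always lands in the least-significant bits of the .b32 value.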
2289 auto GetOperand = [](SDValue Op, int N) -> APInt { 2290 const SDValue &Operand = Op->getOperand(N); 2291 EVT VT = Op->getValueType(0); 2292 if (Operand->isUndef()) 2293 return APInt(32, 0); 2294 APInt Value; 2295 if (VT == MVT::v2f16 || VT == MVT::v2bf16) 2296 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt(); 2297 else if (VT == MVT::v2i16 || VT == MVT::v4i8) 2298 Value = Operand->getAsAPIntVal(); 2299 else 2300 llvm_unreachable("Unsupported type"); 2301 // i8 values are carried around as i16, so we need to zero out upper bits, 2302 // so they do not get in the way of combining individual byte values 2303 if (VT == MVT::v4i8) 2304 Value = Value.trunc(8); 2305 return Value.zext(32); 2306 }; 2307 APInt Value; 2308 if (Isv2x16VT(VT)) { 2309 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16); 2310 } else if (VT == MVT::v4i8) { 2311 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) | 2312 GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24); 2313 } else { 2314 llvm_unreachable("Unsupported type"); 2315 } 2316 SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32); 2317 return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const); 2318 } 2319 2320 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 2321 SelectionDAG &DAG) const { 2322 SDValue Index = Op->getOperand(1); 2323 SDValue Vector = Op->getOperand(0); 2324 SDLoc DL(Op); 2325 EVT VectorVT = Vector.getValueType(); 2326 2327 if (VectorVT == MVT::v4i8) { 2328 SDValue BFE = 2329 DAG.getNode(NVPTXISD::BFE, DL, MVT::i32, 2330 {Vector, 2331 DAG.getNode(ISD::MUL, DL, MVT::i32, 2332 DAG.getZExtOrTrunc(Index, DL, MVT::i32), 2333 DAG.getConstant(8, DL, MVT::i32)), 2334 DAG.getConstant(8, DL, MVT::i32)}); 2335 return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0)); 2336 } 2337 2338 // Constant index will be matched by tablegen. 2339 if (isa<ConstantSDNode>(Index.getNode())) 2340 return Op; 2341 2342 // Extract individual elements and select one of them. 2343 assert(Isv2x16VT(VectorVT) && "Unexpected vector type."); 2344 EVT EltVT = VectorVT.getVectorElementType(); 2345 2346 SDLoc dl(Op.getNode()); 2347 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 2348 DAG.getIntPtrConstant(0, dl)); 2349 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 2350 DAG.getIntPtrConstant(1, dl)); 2351 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1, 2352 ISD::CondCode::SETEQ); 2353 } 2354 2355 SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 2356 SelectionDAG &DAG) const { 2357 SDValue Vector = Op->getOperand(0); 2358 EVT VectorVT = Vector.getValueType(); 2359 2360 if (VectorVT != MVT::v4i8) 2361 return Op; 2362 SDLoc DL(Op); 2363 SDValue Value = Op->getOperand(1); 2364 if (Value->isUndef()) 2365 return Vector; 2366 2367 SDValue Index = Op->getOperand(2); 2368 2369 SDValue BFI = 2370 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, 2371 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector, 2372 DAG.getNode(ISD::MUL, DL, MVT::i32, 2373 DAG.getZExtOrTrunc(Index, DL, MVT::i32), 2374 DAG.getConstant(8, DL, MVT::i32)), 2375 DAG.getConstant(8, DL, MVT::i32)}); 2376 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI); 2377 } 2378 2379 SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, 2380 SelectionDAG &DAG) const { 2381 SDValue V1 = Op.getOperand(0); 2382 EVT VectorVT = V1.getValueType(); 2383 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8) 2384 return Op; 2385 2386 // Lower shuffle to PRMT instruction. 
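  // Example of the selector encoding below (mask chosen for illustration):
  // shuffle mask <0, 4, 1, 5> packs one nibble per result byte, LSB first,
  // giving Selector = 0x5140, i.e. prmt.b32 picks bytes 0, 4, 1 and 5 of the
  // concatenated {V1, V2} byte pool.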
2387 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 2388 SDValue V2 = Op.getOperand(1); 2389 uint32_t Selector = 0; 2390 for (auto I : llvm::enumerate(SVN->getMask())) 2391 Selector |= (I.value() << (I.index() * 4)); 2392 2393 SDLoc DL(Op); 2394 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2, 2395 DAG.getConstant(Selector, DL, MVT::i32), 2396 DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)); 2397 } 2398 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which 2399 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2400 /// amount, or 2401 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2402 /// amount. 2403 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, 2404 SelectionDAG &DAG) const { 2405 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 2406 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 2407 2408 EVT VT = Op.getValueType(); 2409 unsigned VTBits = VT.getSizeInBits(); 2410 SDLoc dl(Op); 2411 SDValue ShOpLo = Op.getOperand(0); 2412 SDValue ShOpHi = Op.getOperand(1); 2413 SDValue ShAmt = Op.getOperand(2); 2414 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 2415 2416 if (VTBits == 32 && STI.getSmVersion() >= 35) { 2417 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 2418 // {dHi, dLo} = {aHi, aLo} >> Amt 2419 // dHi = aHi >> Amt 2420 // dLo = shf.r.clamp aLo, aHi, Amt 2421 2422 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 2423 SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi, 2424 ShAmt); 2425 2426 SDValue Ops[2] = { Lo, Hi }; 2427 return DAG.getMergeValues(Ops, dl); 2428 } 2429 else { 2430 // {dHi, dLo} = {aHi, aLo} >> Amt 2431 // - if (Amt>=size) then 2432 // dLo = aHi >> (Amt-size) 2433 // dHi = aHi >> Amt (this is either all 0 or all 1) 2434 // else 2435 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) 2436 // dHi = aHi >> Amt 2437 2438 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 2439 DAG.getConstant(VTBits, dl, MVT::i32), 2440 ShAmt); 2441 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 2442 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 2443 DAG.getConstant(VTBits, dl, MVT::i32)); 2444 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 2445 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 2446 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 2447 2448 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, 2449 DAG.getConstant(VTBits, dl, MVT::i32), 2450 ISD::SETGE); 2451 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 2452 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); 2453 2454 SDValue Ops[2] = { Lo, Hi }; 2455 return DAG.getMergeValues(Ops, dl); 2456 } 2457 } 2458 2459 /// LowerShiftLeftParts - Lower SHL_PARTS, which 2460 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2461 /// amount, or 2462 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2463 /// amount. 
SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
                                                 SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  assert(Op.getOpcode() == ISD::SHL_PARTS);

  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);

  if (VTBits == 32 && STI.getSmVersion() >= 35) {
    // For 32-bit and sm_35+, we can use the funnel shift 'shf' instruction.
    // {dHi, dLo} = {aHi, aLo} << Amt
    //   dHi = shf.l.clamp aLo, aHi, Amt
    //   dLo = aLo << Amt

    SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
                             ShAmt);
    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
  else {
    // {dHi, dLo} = {aHi, aLo} << Amt
    // - if (Amt>=size) then
    //                    dLo = aLo << Amt (all 0)
    //                    dHi = aLo << (Amt-size)
    //   else
    //                    dLo = aLo << Amt
    //                    dHi = (aHi << Amt) | (aLo >> (size-Amt))

    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                   DAG.getConstant(VTBits, dl, MVT::i32),
                                   ShAmt);
    SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                     DAG.getConstant(VTBits, dl, MVT::i32));
    SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
    SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);

    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
                               DAG.getConstant(VTBits, dl, MVT::i32),
                               ISD::SETGE);
    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
    SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
}

SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFROUND32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFROUND64(Op, DAG);

  llvm_unreachable("unhandled type");
}

// This is the rounding method used in CUDA libdevice, in C-like code:
// float roundf(float A)
// {
//   float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
//   RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
//   return abs(A) < 0.5 ? (float)(int)A : RoundedA;
// }
SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue A = Op.getOperand(0);
  EVT VT = Op.getValueType();

  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);

  // RoundedA = (float) (int) ( A > 0 ?
(A + 0.5f) : (A - 0.5f)) 2547 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A); 2548 const int SignBitMask = 0x80000000; 2549 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast, 2550 DAG.getConstant(SignBitMask, SL, MVT::i32)); 2551 const int PointFiveInBits = 0x3F000000; 2552 SDValue PointFiveWithSignRaw = 2553 DAG.getNode(ISD::OR, SL, MVT::i32, Sign, 2554 DAG.getConstant(PointFiveInBits, SL, MVT::i32)); 2555 SDValue PointFiveWithSign = 2556 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw); 2557 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign); 2558 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA); 2559 2560 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA; 2561 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2562 SDValue IsLarge = 2563 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT), 2564 ISD::SETOGT); 2565 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); 2566 2567 // return abs(A) < 0.5 ? (float)(int)A : RoundedA; 2568 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2569 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2570 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A); 2571 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA); 2572 } 2573 2574 // The implementation of round(double) is similar to that of round(float) in 2575 // that they both separate the value range into three regions and use a method 2576 // specific to the region to round the values. However, round(double) first 2577 // calculates the round of the absolute value and then adds the sign back while 2578 // round(float) directly rounds the value with sign. 2579 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op, 2580 SelectionDAG &DAG) const { 2581 SDLoc SL(Op); 2582 SDValue A = Op.getOperand(0); 2583 EVT VT = Op.getValueType(); 2584 2585 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A); 2586 2587 // double RoundedA = (double) (int) (abs(A) + 0.5f); 2588 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA, 2589 DAG.getConstantFP(0.5, SL, VT)); 2590 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA); 2591 2592 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA; 2593 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2594 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2595 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2596 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall, 2597 DAG.getConstantFP(0, SL, VT), 2598 RoundedA); 2599 2600 // Add sign to rounded_A 2601 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A); 2602 DAG.getNode(ISD::FTRUNC, SL, VT, A); 2603 2604 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA; 2605 SDValue IsLarge = 2606 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT), 2607 ISD::SETOGT); 2608 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); 2609 } 2610 2611 SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op, 2612 SelectionDAG &DAG) const { 2613 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78); 2614 2615 if (Op.getValueType() == MVT::bf16) { 2616 SDLoc Loc(Op); 2617 return DAG.getNode( 2618 ISD::FP_ROUND, Loc, MVT::bf16, 2619 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)), 2620 DAG.getIntPtrConstant(0, Loc)); 2621 } 2622 2623 // Everything else is considered legal. 
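  // For illustration of the bf16 case above: e.g. "uitofp i32 %x to bfloat"
  // becomes an i32 -> f32 convert followed by an f32 -> bf16 FP_ROUND on
  // targets older than sm_90 / PTX ISA 7.8.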
2624 return Op; 2625 } 2626 2627 SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op, 2628 SelectionDAG &DAG) const { 2629 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78); 2630 2631 if (Op.getOperand(0).getValueType() == MVT::bf16) { 2632 SDLoc Loc(Op); 2633 return DAG.getNode( 2634 Op.getOpcode(), Loc, Op.getValueType(), 2635 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0))); 2636 } 2637 2638 // Everything else is considered legal. 2639 return Op; 2640 } 2641 2642 static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) { 2643 SDLoc DL(Op); 2644 if (Op.getValueType() != MVT::v2i16) 2645 return Op; 2646 EVT EltVT = Op.getValueType().getVectorElementType(); 2647 SmallVector<SDValue> VecElements; 2648 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) { 2649 SmallVector<SDValue> ScalarArgs; 2650 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs), 2651 [&](const SDUse &O) { 2652 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, 2653 O.get(), DAG.getIntPtrConstant(I, DL)); 2654 }); 2655 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs)); 2656 } 2657 SDValue V = 2658 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements); 2659 return V; 2660 } 2661 2662 SDValue 2663 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 2664 switch (Op.getOpcode()) { 2665 case ISD::RETURNADDR: 2666 return SDValue(); 2667 case ISD::FRAMEADDR: 2668 return SDValue(); 2669 case ISD::GlobalAddress: 2670 return LowerGlobalAddress(Op, DAG); 2671 case ISD::INTRINSIC_W_CHAIN: 2672 return Op; 2673 case ISD::BUILD_VECTOR: 2674 return LowerBUILD_VECTOR(Op, DAG); 2675 case ISD::EXTRACT_SUBVECTOR: 2676 return Op; 2677 case ISD::EXTRACT_VECTOR_ELT: 2678 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 2679 case ISD::INSERT_VECTOR_ELT: 2680 return LowerINSERT_VECTOR_ELT(Op, DAG); 2681 case ISD::VECTOR_SHUFFLE: 2682 return LowerVECTOR_SHUFFLE(Op, DAG); 2683 case ISD::CONCAT_VECTORS: 2684 return LowerCONCAT_VECTORS(Op, DAG); 2685 case ISD::STORE: 2686 return LowerSTORE(Op, DAG); 2687 case ISD::LOAD: 2688 return LowerLOAD(Op, DAG); 2689 case ISD::SHL_PARTS: 2690 return LowerShiftLeftParts(Op, DAG); 2691 case ISD::SRA_PARTS: 2692 case ISD::SRL_PARTS: 2693 return LowerShiftRightParts(Op, DAG); 2694 case ISD::SELECT: 2695 return LowerSelect(Op, DAG); 2696 case ISD::FROUND: 2697 return LowerFROUND(Op, DAG); 2698 case ISD::SINT_TO_FP: 2699 case ISD::UINT_TO_FP: 2700 return LowerINT_TO_FP(Op, DAG); 2701 case ISD::FP_TO_SINT: 2702 case ISD::FP_TO_UINT: 2703 return LowerFP_TO_INT(Op, DAG); 2704 case ISD::VAARG: 2705 return LowerVAARG(Op, DAG); 2706 case ISD::VASTART: 2707 return LowerVASTART(Op, DAG); 2708 case ISD::ABS: 2709 case ISD::SMIN: 2710 case ISD::SMAX: 2711 case ISD::UMIN: 2712 case ISD::UMAX: 2713 case ISD::ADD: 2714 case ISD::SUB: 2715 case ISD::MUL: 2716 case ISD::SHL: 2717 case ISD::SREM: 2718 case ISD::UREM: 2719 return LowerVectorArith(Op, DAG); 2720 case ISD::DYNAMIC_STACKALLOC: 2721 return LowerDYNAMIC_STACKALLOC(Op, DAG); 2722 default: 2723 llvm_unreachable("Custom lowering not defined for operation"); 2724 } 2725 } 2726 2727 // This function is almost a copy of SelectionDAG::expandVAArg(). 2728 // The only diff is that this one produces loads from local address space. 
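// For example (illustrative values): a va_arg slot requiring 8-byte alignment
// first rounds the pointer up as VAList = (VAList + 7) & ~7 below, then loads
// the argument and stores the advanced pointer back into the va_list.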
2729 SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 2730 const TargetLowering *TLI = STI.getTargetLowering(); 2731 SDLoc DL(Op); 2732 2733 SDNode *Node = Op.getNode(); 2734 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue(); 2735 EVT VT = Node->getValueType(0); 2736 auto *Ty = VT.getTypeForEVT(*DAG.getContext()); 2737 SDValue Tmp1 = Node->getOperand(0); 2738 SDValue Tmp2 = Node->getOperand(1); 2739 const MaybeAlign MA(Node->getConstantOperandVal(3)); 2740 2741 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL, 2742 Tmp1, Tmp2, MachinePointerInfo(V)); 2743 SDValue VAList = VAListLoad; 2744 2745 if (MA && *MA > TLI->getMinStackArgumentAlignment()) { 2746 VAList = DAG.getNode( 2747 ISD::ADD, DL, VAList.getValueType(), VAList, 2748 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType())); 2749 2750 VAList = DAG.getNode( 2751 ISD::AND, DL, VAList.getValueType(), VAList, 2752 DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType())); 2753 } 2754 2755 // Increment the pointer, VAList, to the next vaarg 2756 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList, 2757 DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty), 2758 DL, VAList.getValueType())); 2759 2760 // Store the incremented VAList to the legalized pointer 2761 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2, 2762 MachinePointerInfo(V)); 2763 2764 const Value *SrcV = 2765 Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL)); 2766 2767 // Load the actual argument out of the pointer VAList 2768 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV)); 2769 } 2770 2771 SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 2772 const TargetLowering *TLI = STI.getTargetLowering(); 2773 SDLoc DL(Op); 2774 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout()); 2775 2776 // Store the address of unsized array <function>_vararg[] in the ap object. 2777 SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT); 2778 SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg); 2779 2780 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 2781 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1), 2782 MachinePointerInfo(SV)); 2783 } 2784 2785 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const { 2786 SDValue Op0 = Op->getOperand(0); 2787 SDValue Op1 = Op->getOperand(1); 2788 SDValue Op2 = Op->getOperand(2); 2789 SDLoc DL(Op.getNode()); 2790 2791 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1"); 2792 2793 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); 2794 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); 2795 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2); 2796 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select); 2797 2798 return Trunc; 2799 } 2800 2801 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 2802 if (Op.getValueType() == MVT::i1) 2803 return LowerLOADi1(Op, DAG); 2804 2805 // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle 2806 // unaligned loads and have to handle it here. 
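  // E.g. (assumed scenario): a <4 x i8> load with align 1 that the target does
  // not allow is expanded here via expandUnalignedLoad() into narrower loads
  // whose results are reassembled into the original 32-bit value.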
2807 EVT VT = Op.getValueType(); 2808 if (Isv2x16VT(VT) || VT == MVT::v4i8) { 2809 LoadSDNode *Load = cast<LoadSDNode>(Op); 2810 EVT MemVT = Load->getMemoryVT(); 2811 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2812 MemVT, *Load->getMemOperand())) { 2813 SDValue Ops[2]; 2814 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); 2815 return DAG.getMergeValues(Ops, SDLoc(Op)); 2816 } 2817 } 2818 2819 return SDValue(); 2820 } 2821 2822 // v = ld i1* addr 2823 // => 2824 // v1 = ld i8* addr (-> i16) 2825 // v = trunc i16 to i1 2826 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { 2827 SDNode *Node = Op.getNode(); 2828 LoadSDNode *LD = cast<LoadSDNode>(Node); 2829 SDLoc dl(Node); 2830 assert(LD->getExtensionType() == ISD::NON_EXTLOAD); 2831 assert(Node->getValueType(0) == MVT::i1 && 2832 "Custom lowering for i1 load only"); 2833 SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), 2834 LD->getPointerInfo(), LD->getAlign(), 2835 LD->getMemOperand()->getFlags()); 2836 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); 2837 // The legalizer (the caller) is expecting two values from the legalized 2838 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() 2839 // in LegalizeDAG.cpp which also uses MergeValues. 2840 SDValue Ops[] = { result, LD->getChain() }; 2841 return DAG.getMergeValues(Ops, dl); 2842 } 2843 2844 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 2845 StoreSDNode *Store = cast<StoreSDNode>(Op); 2846 EVT VT = Store->getMemoryVT(); 2847 2848 if (VT == MVT::i1) 2849 return LowerSTOREi1(Op, DAG); 2850 2851 // v2f16 is legal, so we can't rely on legalizer to handle unaligned 2852 // stores and have to handle it here. 2853 if ((Isv2x16VT(VT) || VT == MVT::v4i8) && 2854 !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2855 VT, *Store->getMemOperand())) 2856 return expandUnalignedStore(Store, DAG); 2857 2858 // v2f16, v2bf16 and v2i16 don't need special handling. 2859 if (Isv2x16VT(VT) || VT == MVT::v4i8) 2860 return SDValue(); 2861 2862 if (VT.isVector()) 2863 return LowerSTOREVector(Op, DAG); 2864 2865 return SDValue(); 2866 } 2867 2868 SDValue 2869 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { 2870 SDNode *N = Op.getNode(); 2871 SDValue Val = N->getOperand(1); 2872 SDLoc DL(N); 2873 EVT ValVT = Val.getValueType(); 2874 2875 if (ValVT.isVector()) { 2876 // We only handle "native" vector sizes for now, e.g. <4 x double> is not 2877 // legal. We can (and should) split that into 2 stores of <2 x double> here 2878 // but I'm leaving that as a TODO for now. 
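    // Here "native" means a vector type that maps directly onto a PTX st.v2 /
    // st.v4 store (possibly after packing 16-bit elements into 32-bit lanes);
    // the switch below lists exactly those types.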
2879 if (!ValVT.isSimple()) 2880 return SDValue(); 2881 switch (ValVT.getSimpleVT().SimpleTy) { 2882 default: 2883 return SDValue(); 2884 case MVT::v2i8: 2885 case MVT::v2i16: 2886 case MVT::v2i32: 2887 case MVT::v2i64: 2888 case MVT::v2f16: 2889 case MVT::v2bf16: 2890 case MVT::v2f32: 2891 case MVT::v2f64: 2892 case MVT::v4i8: 2893 case MVT::v4i16: 2894 case MVT::v4i32: 2895 case MVT::v4f16: 2896 case MVT::v4bf16: 2897 case MVT::v4f32: 2898 case MVT::v8f16: // <4 x f16x2> 2899 case MVT::v8bf16: // <4 x bf16x2> 2900 case MVT::v8i16: // <4 x i16x2> 2901 // This is a "native" vector type 2902 break; 2903 } 2904 2905 MemSDNode *MemSD = cast<MemSDNode>(N); 2906 const DataLayout &TD = DAG.getDataLayout(); 2907 2908 Align Alignment = MemSD->getAlign(); 2909 Align PrefAlign = 2910 TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext())); 2911 if (Alignment < PrefAlign) { 2912 // This store is not sufficiently aligned, so bail out and let this vector 2913 // store be scalarized. Note that we may still be able to emit smaller 2914 // vector stores. For example, if we are storing a <4 x float> with an 2915 // alignment of 8, this check will fail but the legalizer will try again 2916 // with 2 x <2 x float>, which will succeed with an alignment of 8. 2917 return SDValue(); 2918 } 2919 2920 unsigned Opcode = 0; 2921 EVT EltVT = ValVT.getVectorElementType(); 2922 unsigned NumElts = ValVT.getVectorNumElements(); 2923 2924 // Since StoreV2 is a target node, we cannot rely on DAG type legalization. 2925 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 2926 // stored type to i16 and propagate the "real" type as the memory type. 2927 bool NeedExt = false; 2928 if (EltVT.getSizeInBits() < 16) 2929 NeedExt = true; 2930 2931 bool StoreF16x2 = false; 2932 switch (NumElts) { 2933 default: 2934 return SDValue(); 2935 case 2: 2936 Opcode = NVPTXISD::StoreV2; 2937 break; 2938 case 4: 2939 Opcode = NVPTXISD::StoreV4; 2940 break; 2941 case 8: 2942 // v8f16 is a special case. PTX doesn't have st.v8.f16 2943 // instruction. Instead, we split the vector into v2f16 chunks and 2944 // store them with st.v4.b32. 
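      // Illustrative example: storing an <8 x half> value ends up as four
      // v2f16 packs, e.g. {(e0,e1), (e2,e3), (e4,e5), (e6,e7)}, emitted as a
      // single st.v4.b32.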
2945 assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector."); 2946 Opcode = NVPTXISD::StoreV4; 2947 StoreF16x2 = true; 2948 break; 2949 } 2950 2951 SmallVector<SDValue, 8> Ops; 2952 2953 // First is the chain 2954 Ops.push_back(N->getOperand(0)); 2955 2956 if (StoreF16x2) { 2957 // Combine f16,f16 -> v2f16 2958 NumElts /= 2; 2959 for (unsigned i = 0; i < NumElts; ++i) { 2960 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 2961 DAG.getIntPtrConstant(i * 2, DL)); 2962 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 2963 DAG.getIntPtrConstant(i * 2 + 1, DL)); 2964 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2); 2965 SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1); 2966 Ops.push_back(V2); 2967 } 2968 } else { 2969 // Then the split values 2970 for (unsigned i = 0; i < NumElts; ++i) { 2971 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 2972 DAG.getIntPtrConstant(i, DL)); 2973 if (NeedExt) 2974 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); 2975 Ops.push_back(ExtVal); 2976 } 2977 } 2978 2979 // Then any remaining arguments 2980 Ops.append(N->op_begin() + 2, N->op_end()); 2981 2982 SDValue NewSt = 2983 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops, 2984 MemSD->getMemoryVT(), MemSD->getMemOperand()); 2985 2986 // return DCI.CombineTo(N, NewSt, true); 2987 return NewSt; 2988 } 2989 2990 return SDValue(); 2991 } 2992 2993 // st i1 v, addr 2994 // => 2995 // v1 = zxt v to i16 2996 // st.u8 i16, addr 2997 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { 2998 SDNode *Node = Op.getNode(); 2999 SDLoc dl(Node); 3000 StoreSDNode *ST = cast<StoreSDNode>(Node); 3001 SDValue Tmp1 = ST->getChain(); 3002 SDValue Tmp2 = ST->getBasePtr(); 3003 SDValue Tmp3 = ST->getValue(); 3004 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only"); 3005 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); 3006 SDValue Result = 3007 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, 3008 ST->getAlign(), ST->getMemOperand()->getFlags()); 3009 return Result; 3010 } 3011 3012 // This creates target external symbol for a function parameter. 3013 // Name of the symbol is composed from its index and the function name. 3014 // Negative index corresponds to special parameter (unsized array) used for 3015 // passing variable arguments. 
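// For example (illustrative naming; see getParamName() for the exact string):
// parameter 0 of a function "foo" is referenced as "foo_param_0", and the
// vararg parameter (idx == -1) as "foo_vararg".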
3016 SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, 3017 EVT v) const { 3018 StringRef SavedStr = nvTM->getStrPool().save( 3019 getParamName(&DAG.getMachineFunction().getFunction(), idx)); 3020 return DAG.getTargetExternalSymbol(SavedStr.data(), v); 3021 } 3022 3023 SDValue NVPTXTargetLowering::LowerFormalArguments( 3024 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3025 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3026 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3027 MachineFunction &MF = DAG.getMachineFunction(); 3028 const DataLayout &DL = DAG.getDataLayout(); 3029 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3030 3031 const Function *F = &MF.getFunction(); 3032 const AttributeList &PAL = F->getAttributes(); 3033 const TargetLowering *TLI = STI.getTargetLowering(); 3034 3035 SDValue Root = DAG.getRoot(); 3036 std::vector<SDValue> OutChains; 3037 3038 bool isABI = (STI.getSmVersion() >= 20); 3039 assert(isABI && "Non-ABI compilation is not supported"); 3040 if (!isABI) 3041 return Chain; 3042 3043 std::vector<Type *> argTypes; 3044 std::vector<const Argument *> theArgs; 3045 for (const Argument &I : F->args()) { 3046 theArgs.push_back(&I); 3047 argTypes.push_back(I.getType()); 3048 } 3049 // argTypes.size() (or theArgs.size()) and Ins.size() need not match. 3050 // Ins.size() will be larger 3051 // * if there is an aggregate argument with multiple fields (each field 3052 // showing up separately in Ins) 3053 // * if there is a vector argument with more than typical vector-length 3054 // elements (generally if more than 4) where each vector element is 3055 // individually present in Ins. 3056 // So a different index should be used for indexing into Ins. 3057 // See similar issue in LowerCall. 3058 unsigned InsIdx = 0; 3059 3060 int idx = 0; 3061 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) { 3062 Type *Ty = argTypes[i]; 3063 3064 if (theArgs[i]->use_empty()) { 3065 // argument is dead 3066 if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) { 3067 SmallVector<EVT, 16> vtparts; 3068 3069 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts); 3070 if (vtparts.empty()) 3071 report_fatal_error("Empty parameter types are not supported"); 3072 3073 for (unsigned parti = 0, parte = vtparts.size(); parti != parte; 3074 ++parti) { 3075 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 3076 ++InsIdx; 3077 } 3078 if (vtparts.size() > 0) 3079 --InsIdx; 3080 continue; 3081 } 3082 if (Ty->isVectorTy()) { 3083 EVT ObjectVT = getValueType(DL, Ty); 3084 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT); 3085 for (unsigned parti = 0; parti < NumRegs; ++parti) { 3086 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 3087 ++InsIdx; 3088 } 3089 if (NumRegs > 0) 3090 --InsIdx; 3091 continue; 3092 } 3093 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 3094 continue; 3095 } 3096 3097 // In the following cases, assign a node order of "idx+1" 3098 // to newly created nodes. The SDNodes for params have to 3099 // appear in the same order as their order of appearance 3100 // in the original function. "idx+1" holds that order. 
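    // For non-byval parameters, ComputePTXValueVTs decomposes the type and
    // VectorizePTXValueVTs groups adjacent pieces: each run flagged
    // PVF_FIRST..PVF_LAST is fetched with one load from the .param address
    // space and then unpacked with EXTRACT_VECTOR_ELT nodes.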
3101 if (!PAL.hasParamAttr(i, Attribute::ByVal)) { 3102 bool aggregateIsPacked = false; 3103 if (StructType *STy = dyn_cast<StructType>(Ty)) 3104 aggregateIsPacked = STy->isPacked(); 3105 3106 SmallVector<EVT, 16> VTs; 3107 SmallVector<uint64_t, 16> Offsets; 3108 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); 3109 if (VTs.empty()) 3110 report_fatal_error("Empty parameter types are not supported"); 3111 3112 auto VectorInfo = 3113 VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty)); 3114 3115 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 3116 int VecIdx = -1; // Index of the first element of the current vector. 3117 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { 3118 if (VectorInfo[parti] & PVF_FIRST) { 3119 assert(VecIdx == -1 && "Orphaned vector."); 3120 VecIdx = parti; 3121 } 3122 3123 // That's the last element of this store op. 3124 if (VectorInfo[parti] & PVF_LAST) { 3125 unsigned NumElts = parti - VecIdx + 1; 3126 EVT EltVT = VTs[parti]; 3127 // i1 is loaded/stored as i8. 3128 EVT LoadVT = EltVT; 3129 if (EltVT == MVT::i1) 3130 LoadVT = MVT::i8; 3131 else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8) 3132 // getLoad needs a vector type, but it can't handle 3133 // vectors which contain v2f16 or v2bf16 elements. So we must load 3134 // using i32 here and then bitcast back. 3135 LoadVT = MVT::i32; 3136 3137 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); 3138 SDValue VecAddr = 3139 DAG.getNode(ISD::ADD, dl, PtrVT, Arg, 3140 DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); 3141 Value *srcValue = Constant::getNullValue(PointerType::get( 3142 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); 3143 SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr, 3144 MachinePointerInfo(srcValue), 3145 MaybeAlign(aggregateIsPacked ? 1 : 0), 3146 MachineMemOperand::MODereferenceable | 3147 MachineMemOperand::MOInvariant); 3148 if (P.getNode()) 3149 P.getNode()->setIROrder(idx + 1); 3150 for (unsigned j = 0; j < NumElts; ++j) { 3151 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P, 3152 DAG.getIntPtrConstant(j, dl)); 3153 // We've loaded i1 as an i8 and now must truncate it back to i1 3154 if (EltVT == MVT::i1) 3155 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); 3156 // v2f16 was loaded as an i32. Now we must bitcast it back. 3157 else if (EltVT != LoadVT) 3158 Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt); 3159 3160 // If a promoted integer type is used, truncate down to the original 3161 MVT PromotedVT; 3162 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) { 3163 Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); 3164 } 3165 3166 // Extend the element if necessary (e.g. an i8 is loaded 3167 // into an i16 register) 3168 if (Ins[InsIdx].VT.isInteger() && 3169 Ins[InsIdx].VT.getFixedSizeInBits() > 3170 LoadVT.getFixedSizeInBits()) { 3171 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 3172 : ISD::ZERO_EXTEND; 3173 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt); 3174 } 3175 InVals.push_back(Elt); 3176 } 3177 3178 // Reset vector tracking state. 3179 VecIdx = -1; 3180 } 3181 ++InsIdx; 3182 } 3183 if (VTs.size() > 0) 3184 --InsIdx; 3185 continue; 3186 } 3187 3188 // Param has ByVal attribute 3189 // Return MoveParam(param symbol). 
3190 // Ideally, the param symbol can be returned directly, 3191 // but when SDNode builder decides to use it in a CopyToReg(), 3192 // machine instruction fails because TargetExternalSymbol 3193 // (not lowered) is target dependent, and CopyToReg assumes 3194 // the source is lowered. 3195 EVT ObjectVT = getValueType(DL, Ty); 3196 assert(ObjectVT == Ins[InsIdx].VT && 3197 "Ins type did not match function type"); 3198 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 3199 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); 3200 if (p.getNode()) 3201 p.getNode()->setIROrder(idx + 1); 3202 InVals.push_back(p); 3203 } 3204 3205 if (!OutChains.empty()) 3206 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains)); 3207 3208 return Chain; 3209 } 3210 3211 SDValue 3212 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 3213 bool isVarArg, 3214 const SmallVectorImpl<ISD::OutputArg> &Outs, 3215 const SmallVectorImpl<SDValue> &OutVals, 3216 const SDLoc &dl, SelectionDAG &DAG) const { 3217 const MachineFunction &MF = DAG.getMachineFunction(); 3218 const Function &F = MF.getFunction(); 3219 Type *RetTy = MF.getFunction().getReturnType(); 3220 3221 bool isABI = (STI.getSmVersion() >= 20); 3222 assert(isABI && "Non-ABI compilation is not supported"); 3223 if (!isABI) 3224 return Chain; 3225 3226 const DataLayout &DL = DAG.getDataLayout(); 3227 SmallVector<SDValue, 16> PromotedOutVals; 3228 SmallVector<EVT, 16> VTs; 3229 SmallVector<uint64_t, 16> Offsets; 3230 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); 3231 assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); 3232 3233 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 3234 SDValue PromotedOutVal = OutVals[i]; 3235 MVT PromotedVT; 3236 if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) { 3237 VTs[i] = EVT(PromotedVT); 3238 } 3239 if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) { 3240 llvm::ISD::NodeType Ext = 3241 Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 3242 PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal); 3243 } 3244 PromotedOutVals.push_back(PromotedOutVal); 3245 } 3246 3247 auto VectorInfo = VectorizePTXValueVTs( 3248 VTs, Offsets, 3249 RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL) 3250 : Align(1)); 3251 3252 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 3253 // 32-bits are sign extended or zero extended, depending on whether 3254 // they are signed or unsigned types. 3255 bool ExtendIntegerRetVal = 3256 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 3257 3258 SmallVector<SDValue, 6> StoreOperands; 3259 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 3260 // New load/store. Record chain and offset operands. 3261 if (VectorInfo[i] & PVF_FIRST) { 3262 assert(StoreOperands.empty() && "Orphaned operand list."); 3263 StoreOperands.push_back(Chain); 3264 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); 3265 } 3266 3267 SDValue OutVal = OutVals[i]; 3268 SDValue RetVal = PromotedOutVals[i]; 3269 3270 if (ExtendIntegerRetVal) { 3271 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND 3272 : ISD::ZERO_EXTEND, 3273 dl, MVT::i32, RetVal); 3274 } else if (OutVal.getValueSizeInBits() < 16) { 3275 // Use 16-bit registers for small load-stores as it's the 3276 // smallest general purpose register size supported by NVPTX. 3277 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); 3278 } 3279 3280 // Record the value to return. 
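    // (Accumulated StoreOperands are flushed as a single StoreRetval /
    // StoreRetvalV2 / StoreRetvalV4 node once the PVF_LAST element of the
    // group is reached below.)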
3281 StoreOperands.push_back(RetVal); 3282 3283 // That's the last element of this store op. 3284 if (VectorInfo[i] & PVF_LAST) { 3285 NVPTXISD::NodeType Op; 3286 unsigned NumElts = StoreOperands.size() - 2; 3287 switch (NumElts) { 3288 case 1: 3289 Op = NVPTXISD::StoreRetval; 3290 break; 3291 case 2: 3292 Op = NVPTXISD::StoreRetvalV2; 3293 break; 3294 case 4: 3295 Op = NVPTXISD::StoreRetvalV4; 3296 break; 3297 default: 3298 llvm_unreachable("Invalid vector info."); 3299 } 3300 3301 // Adjust type of load/store op if we've extended the scalar 3302 // return value. 3303 EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; 3304 Chain = DAG.getMemIntrinsicNode( 3305 Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType, 3306 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); 3307 // Cleanup vector state. 3308 StoreOperands.clear(); 3309 } 3310 } 3311 3312 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain); 3313 } 3314 3315 void NVPTXTargetLowering::LowerAsmOperandForConstraint( 3316 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops, 3317 SelectionDAG &DAG) const { 3318 if (Constraint.size() > 1) 3319 return; 3320 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 3321 } 3322 3323 static unsigned getOpcForTextureInstr(unsigned Intrinsic) { 3324 switch (Intrinsic) { 3325 default: 3326 return 0; 3327 3328 case Intrinsic::nvvm_tex_1d_v4f32_s32: 3329 return NVPTXISD::Tex1DFloatS32; 3330 case Intrinsic::nvvm_tex_1d_v4f32_f32: 3331 return NVPTXISD::Tex1DFloatFloat; 3332 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 3333 return NVPTXISD::Tex1DFloatFloatLevel; 3334 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 3335 return NVPTXISD::Tex1DFloatFloatGrad; 3336 case Intrinsic::nvvm_tex_1d_v4s32_s32: 3337 return NVPTXISD::Tex1DS32S32; 3338 case Intrinsic::nvvm_tex_1d_v4s32_f32: 3339 return NVPTXISD::Tex1DS32Float; 3340 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 3341 return NVPTXISD::Tex1DS32FloatLevel; 3342 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 3343 return NVPTXISD::Tex1DS32FloatGrad; 3344 case Intrinsic::nvvm_tex_1d_v4u32_s32: 3345 return NVPTXISD::Tex1DU32S32; 3346 case Intrinsic::nvvm_tex_1d_v4u32_f32: 3347 return NVPTXISD::Tex1DU32Float; 3348 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 3349 return NVPTXISD::Tex1DU32FloatLevel; 3350 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 3351 return NVPTXISD::Tex1DU32FloatGrad; 3352 3353 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 3354 return NVPTXISD::Tex1DArrayFloatS32; 3355 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 3356 return NVPTXISD::Tex1DArrayFloatFloat; 3357 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 3358 return NVPTXISD::Tex1DArrayFloatFloatLevel; 3359 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 3360 return NVPTXISD::Tex1DArrayFloatFloatGrad; 3361 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 3362 return NVPTXISD::Tex1DArrayS32S32; 3363 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 3364 return NVPTXISD::Tex1DArrayS32Float; 3365 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 3366 return NVPTXISD::Tex1DArrayS32FloatLevel; 3367 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 3368 return NVPTXISD::Tex1DArrayS32FloatGrad; 3369 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 3370 return NVPTXISD::Tex1DArrayU32S32; 3371 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 3372 return NVPTXISD::Tex1DArrayU32Float; 3373 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 3374 return NVPTXISD::Tex1DArrayU32FloatLevel; 3375 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 3376 return 
NVPTXISD::Tex1DArrayU32FloatGrad; 3377 3378 case Intrinsic::nvvm_tex_2d_v4f32_s32: 3379 return NVPTXISD::Tex2DFloatS32; 3380 case Intrinsic::nvvm_tex_2d_v4f32_f32: 3381 return NVPTXISD::Tex2DFloatFloat; 3382 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 3383 return NVPTXISD::Tex2DFloatFloatLevel; 3384 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 3385 return NVPTXISD::Tex2DFloatFloatGrad; 3386 case Intrinsic::nvvm_tex_2d_v4s32_s32: 3387 return NVPTXISD::Tex2DS32S32; 3388 case Intrinsic::nvvm_tex_2d_v4s32_f32: 3389 return NVPTXISD::Tex2DS32Float; 3390 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 3391 return NVPTXISD::Tex2DS32FloatLevel; 3392 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 3393 return NVPTXISD::Tex2DS32FloatGrad; 3394 case Intrinsic::nvvm_tex_2d_v4u32_s32: 3395 return NVPTXISD::Tex2DU32S32; 3396 case Intrinsic::nvvm_tex_2d_v4u32_f32: 3397 return NVPTXISD::Tex2DU32Float; 3398 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 3399 return NVPTXISD::Tex2DU32FloatLevel; 3400 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 3401 return NVPTXISD::Tex2DU32FloatGrad; 3402 3403 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 3404 return NVPTXISD::Tex2DArrayFloatS32; 3405 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 3406 return NVPTXISD::Tex2DArrayFloatFloat; 3407 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 3408 return NVPTXISD::Tex2DArrayFloatFloatLevel; 3409 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 3410 return NVPTXISD::Tex2DArrayFloatFloatGrad; 3411 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 3412 return NVPTXISD::Tex2DArrayS32S32; 3413 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 3414 return NVPTXISD::Tex2DArrayS32Float; 3415 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 3416 return NVPTXISD::Tex2DArrayS32FloatLevel; 3417 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 3418 return NVPTXISD::Tex2DArrayS32FloatGrad; 3419 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 3420 return NVPTXISD::Tex2DArrayU32S32; 3421 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 3422 return NVPTXISD::Tex2DArrayU32Float; 3423 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 3424 return NVPTXISD::Tex2DArrayU32FloatLevel; 3425 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 3426 return NVPTXISD::Tex2DArrayU32FloatGrad; 3427 3428 case Intrinsic::nvvm_tex_3d_v4f32_s32: 3429 return NVPTXISD::Tex3DFloatS32; 3430 case Intrinsic::nvvm_tex_3d_v4f32_f32: 3431 return NVPTXISD::Tex3DFloatFloat; 3432 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 3433 return NVPTXISD::Tex3DFloatFloatLevel; 3434 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 3435 return NVPTXISD::Tex3DFloatFloatGrad; 3436 case Intrinsic::nvvm_tex_3d_v4s32_s32: 3437 return NVPTXISD::Tex3DS32S32; 3438 case Intrinsic::nvvm_tex_3d_v4s32_f32: 3439 return NVPTXISD::Tex3DS32Float; 3440 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 3441 return NVPTXISD::Tex3DS32FloatLevel; 3442 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 3443 return NVPTXISD::Tex3DS32FloatGrad; 3444 case Intrinsic::nvvm_tex_3d_v4u32_s32: 3445 return NVPTXISD::Tex3DU32S32; 3446 case Intrinsic::nvvm_tex_3d_v4u32_f32: 3447 return NVPTXISD::Tex3DU32Float; 3448 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 3449 return NVPTXISD::Tex3DU32FloatLevel; 3450 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 3451 return NVPTXISD::Tex3DU32FloatGrad; 3452 3453 case Intrinsic::nvvm_tex_cube_v4f32_f32: 3454 return NVPTXISD::TexCubeFloatFloat; 3455 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 3456 return NVPTXISD::TexCubeFloatFloatLevel; 3457 case Intrinsic::nvvm_tex_cube_v4s32_f32: 3458 return 
NVPTXISD::TexCubeS32Float; 3459 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 3460 return NVPTXISD::TexCubeS32FloatLevel; 3461 case Intrinsic::nvvm_tex_cube_v4u32_f32: 3462 return NVPTXISD::TexCubeU32Float; 3463 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 3464 return NVPTXISD::TexCubeU32FloatLevel; 3465 3466 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 3467 return NVPTXISD::TexCubeArrayFloatFloat; 3468 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 3469 return NVPTXISD::TexCubeArrayFloatFloatLevel; 3470 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 3471 return NVPTXISD::TexCubeArrayS32Float; 3472 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 3473 return NVPTXISD::TexCubeArrayS32FloatLevel; 3474 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 3475 return NVPTXISD::TexCubeArrayU32Float; 3476 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 3477 return NVPTXISD::TexCubeArrayU32FloatLevel; 3478 3479 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 3480 return NVPTXISD::Tld4R2DFloatFloat; 3481 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 3482 return NVPTXISD::Tld4G2DFloatFloat; 3483 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 3484 return NVPTXISD::Tld4B2DFloatFloat; 3485 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 3486 return NVPTXISD::Tld4A2DFloatFloat; 3487 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 3488 return NVPTXISD::Tld4R2DS64Float; 3489 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 3490 return NVPTXISD::Tld4G2DS64Float; 3491 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 3492 return NVPTXISD::Tld4B2DS64Float; 3493 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 3494 return NVPTXISD::Tld4A2DS64Float; 3495 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 3496 return NVPTXISD::Tld4R2DU64Float; 3497 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 3498 return NVPTXISD::Tld4G2DU64Float; 3499 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 3500 return NVPTXISD::Tld4B2DU64Float; 3501 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 3502 return NVPTXISD::Tld4A2DU64Float; 3503 3504 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 3505 return NVPTXISD::TexUnified1DFloatS32; 3506 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 3507 return NVPTXISD::TexUnified1DFloatFloat; 3508 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 3509 return NVPTXISD::TexUnified1DFloatFloatLevel; 3510 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 3511 return NVPTXISD::TexUnified1DFloatFloatGrad; 3512 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 3513 return NVPTXISD::TexUnified1DS32S32; 3514 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 3515 return NVPTXISD::TexUnified1DS32Float; 3516 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 3517 return NVPTXISD::TexUnified1DS32FloatLevel; 3518 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 3519 return NVPTXISD::TexUnified1DS32FloatGrad; 3520 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 3521 return NVPTXISD::TexUnified1DU32S32; 3522 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 3523 return NVPTXISD::TexUnified1DU32Float; 3524 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 3525 return NVPTXISD::TexUnified1DU32FloatLevel; 3526 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 3527 return NVPTXISD::TexUnified1DU32FloatGrad; 3528 3529 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 3530 return NVPTXISD::TexUnified1DArrayFloatS32; 3531 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 3532 return NVPTXISD::TexUnified1DArrayFloatFloat; 3533 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 3534 return NVPTXISD::TexUnified1DArrayFloatFloatLevel; 3535 case 
Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 3536 return NVPTXISD::TexUnified1DArrayFloatFloatGrad; 3537 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 3538 return NVPTXISD::TexUnified1DArrayS32S32; 3539 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 3540 return NVPTXISD::TexUnified1DArrayS32Float; 3541 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 3542 return NVPTXISD::TexUnified1DArrayS32FloatLevel; 3543 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 3544 return NVPTXISD::TexUnified1DArrayS32FloatGrad; 3545 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 3546 return NVPTXISD::TexUnified1DArrayU32S32; 3547 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 3548 return NVPTXISD::TexUnified1DArrayU32Float; 3549 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 3550 return NVPTXISD::TexUnified1DArrayU32FloatLevel; 3551 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 3552 return NVPTXISD::TexUnified1DArrayU32FloatGrad; 3553 3554 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 3555 return NVPTXISD::TexUnified2DFloatS32; 3556 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 3557 return NVPTXISD::TexUnified2DFloatFloat; 3558 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 3559 return NVPTXISD::TexUnified2DFloatFloatLevel; 3560 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 3561 return NVPTXISD::TexUnified2DFloatFloatGrad; 3562 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 3563 return NVPTXISD::TexUnified2DS32S32; 3564 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 3565 return NVPTXISD::TexUnified2DS32Float; 3566 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 3567 return NVPTXISD::TexUnified2DS32FloatLevel; 3568 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 3569 return NVPTXISD::TexUnified2DS32FloatGrad; 3570 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 3571 return NVPTXISD::TexUnified2DU32S32; 3572 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 3573 return NVPTXISD::TexUnified2DU32Float; 3574 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 3575 return NVPTXISD::TexUnified2DU32FloatLevel; 3576 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 3577 return NVPTXISD::TexUnified2DU32FloatGrad; 3578 3579 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 3580 return NVPTXISD::TexUnified2DArrayFloatS32; 3581 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 3582 return NVPTXISD::TexUnified2DArrayFloatFloat; 3583 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 3584 return NVPTXISD::TexUnified2DArrayFloatFloatLevel; 3585 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 3586 return NVPTXISD::TexUnified2DArrayFloatFloatGrad; 3587 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 3588 return NVPTXISD::TexUnified2DArrayS32S32; 3589 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 3590 return NVPTXISD::TexUnified2DArrayS32Float; 3591 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 3592 return NVPTXISD::TexUnified2DArrayS32FloatLevel; 3593 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 3594 return NVPTXISD::TexUnified2DArrayS32FloatGrad; 3595 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 3596 return NVPTXISD::TexUnified2DArrayU32S32; 3597 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 3598 return NVPTXISD::TexUnified2DArrayU32Float; 3599 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 3600 return NVPTXISD::TexUnified2DArrayU32FloatLevel; 3601 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 3602 return 
NVPTXISD::TexUnified2DArrayU32FloatGrad; 3603 3604 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 3605 return NVPTXISD::TexUnified3DFloatS32; 3606 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 3607 return NVPTXISD::TexUnified3DFloatFloat; 3608 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 3609 return NVPTXISD::TexUnified3DFloatFloatLevel; 3610 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 3611 return NVPTXISD::TexUnified3DFloatFloatGrad; 3612 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 3613 return NVPTXISD::TexUnified3DS32S32; 3614 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 3615 return NVPTXISD::TexUnified3DS32Float; 3616 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 3617 return NVPTXISD::TexUnified3DS32FloatLevel; 3618 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 3619 return NVPTXISD::TexUnified3DS32FloatGrad; 3620 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 3621 return NVPTXISD::TexUnified3DU32S32; 3622 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 3623 return NVPTXISD::TexUnified3DU32Float; 3624 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 3625 return NVPTXISD::TexUnified3DU32FloatLevel; 3626 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 3627 return NVPTXISD::TexUnified3DU32FloatGrad; 3628 3629 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 3630 return NVPTXISD::TexUnifiedCubeFloatFloat; 3631 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 3632 return NVPTXISD::TexUnifiedCubeFloatFloatLevel; 3633 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 3634 return NVPTXISD::TexUnifiedCubeS32Float; 3635 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 3636 return NVPTXISD::TexUnifiedCubeS32FloatLevel; 3637 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 3638 return NVPTXISD::TexUnifiedCubeU32Float; 3639 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 3640 return NVPTXISD::TexUnifiedCubeU32FloatLevel; 3641 3642 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 3643 return NVPTXISD::TexUnifiedCubeArrayFloatFloat; 3644 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 3645 return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel; 3646 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 3647 return NVPTXISD::TexUnifiedCubeArrayS32Float; 3648 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 3649 return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel; 3650 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 3651 return NVPTXISD::TexUnifiedCubeArrayU32Float; 3652 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 3653 return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; 3654 3655 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 3656 return NVPTXISD::Tld4UnifiedR2DFloatFloat; 3657 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 3658 return NVPTXISD::Tld4UnifiedG2DFloatFloat; 3659 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 3660 return NVPTXISD::Tld4UnifiedB2DFloatFloat; 3661 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 3662 return NVPTXISD::Tld4UnifiedA2DFloatFloat; 3663 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 3664 return NVPTXISD::Tld4UnifiedR2DS64Float; 3665 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 3666 return NVPTXISD::Tld4UnifiedG2DS64Float; 3667 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 3668 return NVPTXISD::Tld4UnifiedB2DS64Float; 3669 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 3670 return NVPTXISD::Tld4UnifiedA2DS64Float; 3671 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 3672 return NVPTXISD::Tld4UnifiedR2DU64Float; 3673 case 
Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 3674 return NVPTXISD::Tld4UnifiedG2DU64Float; 3675 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 3676 return NVPTXISD::Tld4UnifiedB2DU64Float; 3677 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 3678 return NVPTXISD::Tld4UnifiedA2DU64Float; 3679 } 3680 } 3681 3682 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { 3683 switch (Intrinsic) { 3684 default: 3685 return 0; 3686 case Intrinsic::nvvm_suld_1d_i8_clamp: 3687 return NVPTXISD::Suld1DI8Clamp; 3688 case Intrinsic::nvvm_suld_1d_i16_clamp: 3689 return NVPTXISD::Suld1DI16Clamp; 3690 case Intrinsic::nvvm_suld_1d_i32_clamp: 3691 return NVPTXISD::Suld1DI32Clamp; 3692 case Intrinsic::nvvm_suld_1d_i64_clamp: 3693 return NVPTXISD::Suld1DI64Clamp; 3694 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 3695 return NVPTXISD::Suld1DV2I8Clamp; 3696 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 3697 return NVPTXISD::Suld1DV2I16Clamp; 3698 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 3699 return NVPTXISD::Suld1DV2I32Clamp; 3700 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 3701 return NVPTXISD::Suld1DV2I64Clamp; 3702 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 3703 return NVPTXISD::Suld1DV4I8Clamp; 3704 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 3705 return NVPTXISD::Suld1DV4I16Clamp; 3706 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 3707 return NVPTXISD::Suld1DV4I32Clamp; 3708 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 3709 return NVPTXISD::Suld1DArrayI8Clamp; 3710 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 3711 return NVPTXISD::Suld1DArrayI16Clamp; 3712 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 3713 return NVPTXISD::Suld1DArrayI32Clamp; 3714 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 3715 return NVPTXISD::Suld1DArrayI64Clamp; 3716 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 3717 return NVPTXISD::Suld1DArrayV2I8Clamp; 3718 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 3719 return NVPTXISD::Suld1DArrayV2I16Clamp; 3720 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 3721 return NVPTXISD::Suld1DArrayV2I32Clamp; 3722 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 3723 return NVPTXISD::Suld1DArrayV2I64Clamp; 3724 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 3725 return NVPTXISD::Suld1DArrayV4I8Clamp; 3726 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 3727 return NVPTXISD::Suld1DArrayV4I16Clamp; 3728 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 3729 return NVPTXISD::Suld1DArrayV4I32Clamp; 3730 case Intrinsic::nvvm_suld_2d_i8_clamp: 3731 return NVPTXISD::Suld2DI8Clamp; 3732 case Intrinsic::nvvm_suld_2d_i16_clamp: 3733 return NVPTXISD::Suld2DI16Clamp; 3734 case Intrinsic::nvvm_suld_2d_i32_clamp: 3735 return NVPTXISD::Suld2DI32Clamp; 3736 case Intrinsic::nvvm_suld_2d_i64_clamp: 3737 return NVPTXISD::Suld2DI64Clamp; 3738 case Intrinsic::nvvm_suld_2d_v2i8_clamp: 3739 return NVPTXISD::Suld2DV2I8Clamp; 3740 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 3741 return NVPTXISD::Suld2DV2I16Clamp; 3742 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 3743 return NVPTXISD::Suld2DV2I32Clamp; 3744 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 3745 return NVPTXISD::Suld2DV2I64Clamp; 3746 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 3747 return NVPTXISD::Suld2DV4I8Clamp; 3748 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 3749 return NVPTXISD::Suld2DV4I16Clamp; 3750 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 3751 return NVPTXISD::Suld2DV4I32Clamp; 3752 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 3753 return NVPTXISD::Suld2DArrayI8Clamp; 3754 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 3755 return NVPTXISD::Suld2DArrayI16Clamp; 3756 case 
Intrinsic::nvvm_suld_2d_array_i32_clamp: 3757 return NVPTXISD::Suld2DArrayI32Clamp; 3758 case Intrinsic::nvvm_suld_2d_array_i64_clamp: 3759 return NVPTXISD::Suld2DArrayI64Clamp; 3760 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 3761 return NVPTXISD::Suld2DArrayV2I8Clamp; 3762 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 3763 return NVPTXISD::Suld2DArrayV2I16Clamp; 3764 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 3765 return NVPTXISD::Suld2DArrayV2I32Clamp; 3766 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 3767 return NVPTXISD::Suld2DArrayV2I64Clamp; 3768 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 3769 return NVPTXISD::Suld2DArrayV4I8Clamp; 3770 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 3771 return NVPTXISD::Suld2DArrayV4I16Clamp; 3772 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 3773 return NVPTXISD::Suld2DArrayV4I32Clamp; 3774 case Intrinsic::nvvm_suld_3d_i8_clamp: 3775 return NVPTXISD::Suld3DI8Clamp; 3776 case Intrinsic::nvvm_suld_3d_i16_clamp: 3777 return NVPTXISD::Suld3DI16Clamp; 3778 case Intrinsic::nvvm_suld_3d_i32_clamp: 3779 return NVPTXISD::Suld3DI32Clamp; 3780 case Intrinsic::nvvm_suld_3d_i64_clamp: 3781 return NVPTXISD::Suld3DI64Clamp; 3782 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 3783 return NVPTXISD::Suld3DV2I8Clamp; 3784 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 3785 return NVPTXISD::Suld3DV2I16Clamp; 3786 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 3787 return NVPTXISD::Suld3DV2I32Clamp; 3788 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 3789 return NVPTXISD::Suld3DV2I64Clamp; 3790 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 3791 return NVPTXISD::Suld3DV4I8Clamp; 3792 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 3793 return NVPTXISD::Suld3DV4I16Clamp; 3794 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 3795 return NVPTXISD::Suld3DV4I32Clamp; 3796 case Intrinsic::nvvm_suld_1d_i8_trap: 3797 return NVPTXISD::Suld1DI8Trap; 3798 case Intrinsic::nvvm_suld_1d_i16_trap: 3799 return NVPTXISD::Suld1DI16Trap; 3800 case Intrinsic::nvvm_suld_1d_i32_trap: 3801 return NVPTXISD::Suld1DI32Trap; 3802 case Intrinsic::nvvm_suld_1d_i64_trap: 3803 return NVPTXISD::Suld1DI64Trap; 3804 case Intrinsic::nvvm_suld_1d_v2i8_trap: 3805 return NVPTXISD::Suld1DV2I8Trap; 3806 case Intrinsic::nvvm_suld_1d_v2i16_trap: 3807 return NVPTXISD::Suld1DV2I16Trap; 3808 case Intrinsic::nvvm_suld_1d_v2i32_trap: 3809 return NVPTXISD::Suld1DV2I32Trap; 3810 case Intrinsic::nvvm_suld_1d_v2i64_trap: 3811 return NVPTXISD::Suld1DV2I64Trap; 3812 case Intrinsic::nvvm_suld_1d_v4i8_trap: 3813 return NVPTXISD::Suld1DV4I8Trap; 3814 case Intrinsic::nvvm_suld_1d_v4i16_trap: 3815 return NVPTXISD::Suld1DV4I16Trap; 3816 case Intrinsic::nvvm_suld_1d_v4i32_trap: 3817 return NVPTXISD::Suld1DV4I32Trap; 3818 case Intrinsic::nvvm_suld_1d_array_i8_trap: 3819 return NVPTXISD::Suld1DArrayI8Trap; 3820 case Intrinsic::nvvm_suld_1d_array_i16_trap: 3821 return NVPTXISD::Suld1DArrayI16Trap; 3822 case Intrinsic::nvvm_suld_1d_array_i32_trap: 3823 return NVPTXISD::Suld1DArrayI32Trap; 3824 case Intrinsic::nvvm_suld_1d_array_i64_trap: 3825 return NVPTXISD::Suld1DArrayI64Trap; 3826 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 3827 return NVPTXISD::Suld1DArrayV2I8Trap; 3828 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 3829 return NVPTXISD::Suld1DArrayV2I16Trap; 3830 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 3831 return NVPTXISD::Suld1DArrayV2I32Trap; 3832 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 3833 return NVPTXISD::Suld1DArrayV2I64Trap; 3834 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 3835 return NVPTXISD::Suld1DArrayV4I8Trap; 3836 
case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 3837 return NVPTXISD::Suld1DArrayV4I16Trap; 3838 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 3839 return NVPTXISD::Suld1DArrayV4I32Trap; 3840 case Intrinsic::nvvm_suld_2d_i8_trap: 3841 return NVPTXISD::Suld2DI8Trap; 3842 case Intrinsic::nvvm_suld_2d_i16_trap: 3843 return NVPTXISD::Suld2DI16Trap; 3844 case Intrinsic::nvvm_suld_2d_i32_trap: 3845 return NVPTXISD::Suld2DI32Trap; 3846 case Intrinsic::nvvm_suld_2d_i64_trap: 3847 return NVPTXISD::Suld2DI64Trap; 3848 case Intrinsic::nvvm_suld_2d_v2i8_trap: 3849 return NVPTXISD::Suld2DV2I8Trap; 3850 case Intrinsic::nvvm_suld_2d_v2i16_trap: 3851 return NVPTXISD::Suld2DV2I16Trap; 3852 case Intrinsic::nvvm_suld_2d_v2i32_trap: 3853 return NVPTXISD::Suld2DV2I32Trap; 3854 case Intrinsic::nvvm_suld_2d_v2i64_trap: 3855 return NVPTXISD::Suld2DV2I64Trap; 3856 case Intrinsic::nvvm_suld_2d_v4i8_trap: 3857 return NVPTXISD::Suld2DV4I8Trap; 3858 case Intrinsic::nvvm_suld_2d_v4i16_trap: 3859 return NVPTXISD::Suld2DV4I16Trap; 3860 case Intrinsic::nvvm_suld_2d_v4i32_trap: 3861 return NVPTXISD::Suld2DV4I32Trap; 3862 case Intrinsic::nvvm_suld_2d_array_i8_trap: 3863 return NVPTXISD::Suld2DArrayI8Trap; 3864 case Intrinsic::nvvm_suld_2d_array_i16_trap: 3865 return NVPTXISD::Suld2DArrayI16Trap; 3866 case Intrinsic::nvvm_suld_2d_array_i32_trap: 3867 return NVPTXISD::Suld2DArrayI32Trap; 3868 case Intrinsic::nvvm_suld_2d_array_i64_trap: 3869 return NVPTXISD::Suld2DArrayI64Trap; 3870 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 3871 return NVPTXISD::Suld2DArrayV2I8Trap; 3872 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 3873 return NVPTXISD::Suld2DArrayV2I16Trap; 3874 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 3875 return NVPTXISD::Suld2DArrayV2I32Trap; 3876 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 3877 return NVPTXISD::Suld2DArrayV2I64Trap; 3878 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 3879 return NVPTXISD::Suld2DArrayV4I8Trap; 3880 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 3881 return NVPTXISD::Suld2DArrayV4I16Trap; 3882 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 3883 return NVPTXISD::Suld2DArrayV4I32Trap; 3884 case Intrinsic::nvvm_suld_3d_i8_trap: 3885 return NVPTXISD::Suld3DI8Trap; 3886 case Intrinsic::nvvm_suld_3d_i16_trap: 3887 return NVPTXISD::Suld3DI16Trap; 3888 case Intrinsic::nvvm_suld_3d_i32_trap: 3889 return NVPTXISD::Suld3DI32Trap; 3890 case Intrinsic::nvvm_suld_3d_i64_trap: 3891 return NVPTXISD::Suld3DI64Trap; 3892 case Intrinsic::nvvm_suld_3d_v2i8_trap: 3893 return NVPTXISD::Suld3DV2I8Trap; 3894 case Intrinsic::nvvm_suld_3d_v2i16_trap: 3895 return NVPTXISD::Suld3DV2I16Trap; 3896 case Intrinsic::nvvm_suld_3d_v2i32_trap: 3897 return NVPTXISD::Suld3DV2I32Trap; 3898 case Intrinsic::nvvm_suld_3d_v2i64_trap: 3899 return NVPTXISD::Suld3DV2I64Trap; 3900 case Intrinsic::nvvm_suld_3d_v4i8_trap: 3901 return NVPTXISD::Suld3DV4I8Trap; 3902 case Intrinsic::nvvm_suld_3d_v4i16_trap: 3903 return NVPTXISD::Suld3DV4I16Trap; 3904 case Intrinsic::nvvm_suld_3d_v4i32_trap: 3905 return NVPTXISD::Suld3DV4I32Trap; 3906 case Intrinsic::nvvm_suld_1d_i8_zero: 3907 return NVPTXISD::Suld1DI8Zero; 3908 case Intrinsic::nvvm_suld_1d_i16_zero: 3909 return NVPTXISD::Suld1DI16Zero; 3910 case Intrinsic::nvvm_suld_1d_i32_zero: 3911 return NVPTXISD::Suld1DI32Zero; 3912 case Intrinsic::nvvm_suld_1d_i64_zero: 3913 return NVPTXISD::Suld1DI64Zero; 3914 case Intrinsic::nvvm_suld_1d_v2i8_zero: 3915 return NVPTXISD::Suld1DV2I8Zero; 3916 case Intrinsic::nvvm_suld_1d_v2i16_zero: 3917 return NVPTXISD::Suld1DV2I16Zero; 3918 case 
Intrinsic::nvvm_suld_1d_v2i32_zero: 3919 return NVPTXISD::Suld1DV2I32Zero; 3920 case Intrinsic::nvvm_suld_1d_v2i64_zero: 3921 return NVPTXISD::Suld1DV2I64Zero; 3922 case Intrinsic::nvvm_suld_1d_v4i8_zero: 3923 return NVPTXISD::Suld1DV4I8Zero; 3924 case Intrinsic::nvvm_suld_1d_v4i16_zero: 3925 return NVPTXISD::Suld1DV4I16Zero; 3926 case Intrinsic::nvvm_suld_1d_v4i32_zero: 3927 return NVPTXISD::Suld1DV4I32Zero; 3928 case Intrinsic::nvvm_suld_1d_array_i8_zero: 3929 return NVPTXISD::Suld1DArrayI8Zero; 3930 case Intrinsic::nvvm_suld_1d_array_i16_zero: 3931 return NVPTXISD::Suld1DArrayI16Zero; 3932 case Intrinsic::nvvm_suld_1d_array_i32_zero: 3933 return NVPTXISD::Suld1DArrayI32Zero; 3934 case Intrinsic::nvvm_suld_1d_array_i64_zero: 3935 return NVPTXISD::Suld1DArrayI64Zero; 3936 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 3937 return NVPTXISD::Suld1DArrayV2I8Zero; 3938 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 3939 return NVPTXISD::Suld1DArrayV2I16Zero; 3940 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 3941 return NVPTXISD::Suld1DArrayV2I32Zero; 3942 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 3943 return NVPTXISD::Suld1DArrayV2I64Zero; 3944 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 3945 return NVPTXISD::Suld1DArrayV4I8Zero; 3946 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 3947 return NVPTXISD::Suld1DArrayV4I16Zero; 3948 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 3949 return NVPTXISD::Suld1DArrayV4I32Zero; 3950 case Intrinsic::nvvm_suld_2d_i8_zero: 3951 return NVPTXISD::Suld2DI8Zero; 3952 case Intrinsic::nvvm_suld_2d_i16_zero: 3953 return NVPTXISD::Suld2DI16Zero; 3954 case Intrinsic::nvvm_suld_2d_i32_zero: 3955 return NVPTXISD::Suld2DI32Zero; 3956 case Intrinsic::nvvm_suld_2d_i64_zero: 3957 return NVPTXISD::Suld2DI64Zero; 3958 case Intrinsic::nvvm_suld_2d_v2i8_zero: 3959 return NVPTXISD::Suld2DV2I8Zero; 3960 case Intrinsic::nvvm_suld_2d_v2i16_zero: 3961 return NVPTXISD::Suld2DV2I16Zero; 3962 case Intrinsic::nvvm_suld_2d_v2i32_zero: 3963 return NVPTXISD::Suld2DV2I32Zero; 3964 case Intrinsic::nvvm_suld_2d_v2i64_zero: 3965 return NVPTXISD::Suld2DV2I64Zero; 3966 case Intrinsic::nvvm_suld_2d_v4i8_zero: 3967 return NVPTXISD::Suld2DV4I8Zero; 3968 case Intrinsic::nvvm_suld_2d_v4i16_zero: 3969 return NVPTXISD::Suld2DV4I16Zero; 3970 case Intrinsic::nvvm_suld_2d_v4i32_zero: 3971 return NVPTXISD::Suld2DV4I32Zero; 3972 case Intrinsic::nvvm_suld_2d_array_i8_zero: 3973 return NVPTXISD::Suld2DArrayI8Zero; 3974 case Intrinsic::nvvm_suld_2d_array_i16_zero: 3975 return NVPTXISD::Suld2DArrayI16Zero; 3976 case Intrinsic::nvvm_suld_2d_array_i32_zero: 3977 return NVPTXISD::Suld2DArrayI32Zero; 3978 case Intrinsic::nvvm_suld_2d_array_i64_zero: 3979 return NVPTXISD::Suld2DArrayI64Zero; 3980 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 3981 return NVPTXISD::Suld2DArrayV2I8Zero; 3982 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 3983 return NVPTXISD::Suld2DArrayV2I16Zero; 3984 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 3985 return NVPTXISD::Suld2DArrayV2I32Zero; 3986 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 3987 return NVPTXISD::Suld2DArrayV2I64Zero; 3988 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 3989 return NVPTXISD::Suld2DArrayV4I8Zero; 3990 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 3991 return NVPTXISD::Suld2DArrayV4I16Zero; 3992 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 3993 return NVPTXISD::Suld2DArrayV4I32Zero; 3994 case Intrinsic::nvvm_suld_3d_i8_zero: 3995 return NVPTXISD::Suld3DI8Zero; 3996 case Intrinsic::nvvm_suld_3d_i16_zero: 3997 return NVPTXISD::Suld3DI16Zero; 
3998 case Intrinsic::nvvm_suld_3d_i32_zero: 3999 return NVPTXISD::Suld3DI32Zero; 4000 case Intrinsic::nvvm_suld_3d_i64_zero: 4001 return NVPTXISD::Suld3DI64Zero; 4002 case Intrinsic::nvvm_suld_3d_v2i8_zero: 4003 return NVPTXISD::Suld3DV2I8Zero; 4004 case Intrinsic::nvvm_suld_3d_v2i16_zero: 4005 return NVPTXISD::Suld3DV2I16Zero; 4006 case Intrinsic::nvvm_suld_3d_v2i32_zero: 4007 return NVPTXISD::Suld3DV2I32Zero; 4008 case Intrinsic::nvvm_suld_3d_v2i64_zero: 4009 return NVPTXISD::Suld3DV2I64Zero; 4010 case Intrinsic::nvvm_suld_3d_v4i8_zero: 4011 return NVPTXISD::Suld3DV4I8Zero; 4012 case Intrinsic::nvvm_suld_3d_v4i16_zero: 4013 return NVPTXISD::Suld3DV4I16Zero; 4014 case Intrinsic::nvvm_suld_3d_v4i32_zero: 4015 return NVPTXISD::Suld3DV4I32Zero; 4016 } 4017 } 4018 4019 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as 4020 // TgtMemIntrinsic 4021 // because we need the information that is only available in the "Value" type 4022 // of destination 4023 // pointer. In particular, the address space information. 4024 bool NVPTXTargetLowering::getTgtMemIntrinsic( 4025 IntrinsicInfo &Info, const CallInst &I, 4026 MachineFunction &MF, unsigned Intrinsic) const { 4027 switch (Intrinsic) { 4028 default: 4029 return false; 4030 case Intrinsic::nvvm_match_all_sync_i32p: 4031 case Intrinsic::nvvm_match_all_sync_i64p: 4032 Info.opc = ISD::INTRINSIC_W_CHAIN; 4033 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute 4034 // in order to model data exchange with other threads, but perform no real 4035 // memory accesses. 4036 Info.memVT = MVT::i1; 4037 4038 // Our result depends on both our and other thread's arguments. 4039 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 4040 return true; 4041 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col: 4042 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row: 4043 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride: 4044 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride: 4045 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col: 4046 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row: 4047 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride: 4048 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride: 4049 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col: 4050 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row: 4051 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride: 4052 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride: 4053 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col: 4054 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row: 4055 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride: 4056 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride: 4057 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col: 4058 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row: 4059 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride: 4060 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride: 4061 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col: 4062 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row: 4063 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride: 4064 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: { 4065 Info.opc = ISD::INTRINSIC_W_CHAIN; 4066 Info.memVT = MVT::v8f16; 4067 Info.ptrVal = I.getArgOperand(0); 4068 Info.offset = 0; 4069 Info.flags = MachineMemOperand::MOLoad; 4070 Info.align = Align(16); 4071 return true; 4072 } 4073 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col: 4074 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride: 
4075 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride: 4076 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col: 4077 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row: 4078 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride: 4079 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride: 4080 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row: 4081 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col: 4082 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride: 4083 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row: 4084 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride: 4085 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col: 4086 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride: 4087 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride: 4088 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col: 4089 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row: 4090 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride: 4091 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride: 4092 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: 4093 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col: 4094 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride: 4095 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row: 4096 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: { 4097 Info.opc = ISD::INTRINSIC_W_CHAIN; 4098 Info.memVT = MVT::v2i32; 4099 Info.ptrVal = I.getArgOperand(0); 4100 Info.offset = 0; 4101 Info.flags = MachineMemOperand::MOLoad; 4102 Info.align = Align(8); 4103 return true; 4104 } 4105 4106 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col: 4107 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride: 4108 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride: 4109 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col: 4110 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row: 4111 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride: 4112 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride: 4113 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row: 4114 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col: 4115 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride: 4116 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row: 4117 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride: 4118 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col: 4119 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride: 4120 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row: 4121 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride: 4122 4123 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col: 4124 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride: 4125 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride: 4126 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col: 4127 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row: 4128 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride: 4129 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride: 4130 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: 4131 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col: 4132 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride: 4133 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row: 4134 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride: 4135 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col: 4136 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride: 4137 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row: 4138 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride: 4139 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16: 4140 case 
Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: { 4141 Info.opc = ISD::INTRINSIC_W_CHAIN; 4142 Info.memVT = MVT::v4i32; 4143 Info.ptrVal = I.getArgOperand(0); 4144 Info.offset = 0; 4145 Info.flags = MachineMemOperand::MOLoad; 4146 Info.align = Align(16); 4147 return true; 4148 } 4149 4150 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col: 4151 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride: 4152 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride: 4153 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col: 4154 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row: 4155 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride: 4156 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride: 4157 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row: 4158 4159 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col: 4160 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride: 4161 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride: 4162 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col: 4163 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row: 4164 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride: 4165 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride: 4166 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row: 4167 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row: 4168 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride: 4169 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col: 4170 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride: 4171 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row: 4172 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride: 4173 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride: 4174 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row: 4175 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col: 4176 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride: 4177 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride: 4178 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: 4179 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16: 4180 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: { 4181 Info.opc = ISD::INTRINSIC_W_CHAIN; 4182 Info.memVT = MVT::i32; 4183 Info.ptrVal = I.getArgOperand(0); 4184 Info.offset = 0; 4185 Info.flags = MachineMemOperand::MOLoad; 4186 Info.align = Align(4); 4187 return true; 4188 } 4189 4190 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col: 4191 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row: 4192 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride: 4193 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride: 4194 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col: 4195 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row: 4196 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride: 4197 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride: 4198 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col: 4199 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row: 4200 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride: 4201 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: { 4202 Info.opc = ISD::INTRINSIC_W_CHAIN; 4203 Info.memVT = MVT::v4f16; 4204 Info.ptrVal = I.getArgOperand(0); 4205 Info.offset = 0; 4206 Info.flags = MachineMemOperand::MOLoad; 4207 Info.align = Align(16); 4208 return true; 4209 } 4210 4211 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col: 4212 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row: 4213 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride: 4214 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride: 4215 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col: 4216 case 
Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row: 4217 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride: 4218 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride: 4219 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col: 4220 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row: 4221 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride: 4222 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: 4223 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col: 4224 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row: 4225 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride: 4226 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: { 4227 Info.opc = ISD::INTRINSIC_W_CHAIN; 4228 Info.memVT = MVT::v8f32; 4229 Info.ptrVal = I.getArgOperand(0); 4230 Info.offset = 0; 4231 Info.flags = MachineMemOperand::MOLoad; 4232 Info.align = Align(16); 4233 return true; 4234 } 4235 4236 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col: 4237 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride: 4238 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row: 4239 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride: 4240 4241 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col: 4242 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride: 4243 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row: 4244 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride: 4245 4246 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col: 4247 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride: 4248 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row: 4249 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride: 4250 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col: 4251 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride: 4252 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row: 4253 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride: 4254 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col: 4255 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride: 4256 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row: 4257 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: { 4258 Info.opc = ISD::INTRINSIC_W_CHAIN; 4259 Info.memVT = MVT::v8i32; 4260 Info.ptrVal = I.getArgOperand(0); 4261 Info.offset = 0; 4262 Info.flags = MachineMemOperand::MOLoad; 4263 Info.align = Align(16); 4264 return true; 4265 } 4266 4267 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col: 4268 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride: 4269 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row: 4270 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride: 4271 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col: 4272 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride: 4273 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row: 4274 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: 4275 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16: 4276 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: { 4277 Info.opc = ISD::INTRINSIC_W_CHAIN; 4278 Info.memVT = MVT::v2i32; 4279 Info.ptrVal = I.getArgOperand(0); 4280 Info.offset = 0; 4281 Info.flags = MachineMemOperand::MOLoad; 4282 Info.align = Align(8); 4283 return true; 4284 } 4285 4286 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col: 4287 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride: 4288 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row: 4289 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride: 4290 4291 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col: 4292 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride: 4293 case 
Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row: 4294 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: { 4295 Info.opc = ISD::INTRINSIC_W_CHAIN; 4296 Info.memVT = MVT::f64; 4297 Info.ptrVal = I.getArgOperand(0); 4298 Info.offset = 0; 4299 Info.flags = MachineMemOperand::MOLoad; 4300 Info.align = Align(8); 4301 return true; 4302 } 4303 4304 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col: 4305 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride: 4306 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row: 4307 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: { 4308 Info.opc = ISD::INTRINSIC_W_CHAIN; 4309 Info.memVT = MVT::v2f64; 4310 Info.ptrVal = I.getArgOperand(0); 4311 Info.offset = 0; 4312 Info.flags = MachineMemOperand::MOLoad; 4313 Info.align = Align(16); 4314 return true; 4315 } 4316 4317 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col: 4318 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row: 4319 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride: 4320 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride: 4321 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col: 4322 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row: 4323 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride: 4324 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride: 4325 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col: 4326 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row: 4327 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride: 4328 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: { 4329 Info.opc = ISD::INTRINSIC_VOID; 4330 Info.memVT = MVT::v4f16; 4331 Info.ptrVal = I.getArgOperand(0); 4332 Info.offset = 0; 4333 Info.flags = MachineMemOperand::MOStore; 4334 Info.align = Align(16); 4335 return true; 4336 } 4337 4338 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col: 4339 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row: 4340 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride: 4341 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride: 4342 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col: 4343 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row: 4344 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride: 4345 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride: 4346 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col: 4347 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row: 4348 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride: 4349 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: 4350 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col: 4351 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row: 4352 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride: 4353 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: { 4354 Info.opc = ISD::INTRINSIC_VOID; 4355 Info.memVT = MVT::v8f32; 4356 Info.ptrVal = I.getArgOperand(0); 4357 Info.offset = 0; 4358 Info.flags = MachineMemOperand::MOStore; 4359 Info.align = Align(16); 4360 return true; 4361 } 4362 4363 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col: 4364 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride: 4365 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row: 4366 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride: 4367 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col: 4368 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride: 4369 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row: 4370 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride: 4371 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col: 4372 case 
Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride: 4373 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row: 4374 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: { 4375 Info.opc = ISD::INTRINSIC_VOID; 4376 Info.memVT = MVT::v8i32; 4377 Info.ptrVal = I.getArgOperand(0); 4378 Info.offset = 0; 4379 Info.flags = MachineMemOperand::MOStore; 4380 Info.align = Align(16); 4381 return true; 4382 } 4383 4384 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col: 4385 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride: 4386 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row: 4387 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride: 4388 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col: 4389 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride: 4390 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row: 4391 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: { 4392 Info.opc = ISD::INTRINSIC_VOID; 4393 Info.memVT = MVT::v2i32; 4394 Info.ptrVal = I.getArgOperand(0); 4395 Info.offset = 0; 4396 Info.flags = MachineMemOperand::MOStore; 4397 Info.align = Align(8); 4398 return true; 4399 } 4400 4401 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col: 4402 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride: 4403 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row: 4404 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: { 4405 Info.opc = ISD::INTRINSIC_VOID; 4406 Info.memVT = MVT::v2f64; 4407 Info.ptrVal = I.getArgOperand(0); 4408 Info.offset = 0; 4409 Info.flags = MachineMemOperand::MOStore; 4410 Info.align = Align(16); 4411 return true; 4412 } 4413 4414 case Intrinsic::nvvm_atomic_load_inc_32: 4415 case Intrinsic::nvvm_atomic_load_dec_32: 4416 4417 case Intrinsic::nvvm_atomic_add_gen_f_cta: 4418 case Intrinsic::nvvm_atomic_add_gen_f_sys: 4419 case Intrinsic::nvvm_atomic_add_gen_i_cta: 4420 case Intrinsic::nvvm_atomic_add_gen_i_sys: 4421 case Intrinsic::nvvm_atomic_and_gen_i_cta: 4422 case Intrinsic::nvvm_atomic_and_gen_i_sys: 4423 case Intrinsic::nvvm_atomic_cas_gen_i_cta: 4424 case Intrinsic::nvvm_atomic_cas_gen_i_sys: 4425 case Intrinsic::nvvm_atomic_dec_gen_i_cta: 4426 case Intrinsic::nvvm_atomic_dec_gen_i_sys: 4427 case Intrinsic::nvvm_atomic_inc_gen_i_cta: 4428 case Intrinsic::nvvm_atomic_inc_gen_i_sys: 4429 case Intrinsic::nvvm_atomic_max_gen_i_cta: 4430 case Intrinsic::nvvm_atomic_max_gen_i_sys: 4431 case Intrinsic::nvvm_atomic_min_gen_i_cta: 4432 case Intrinsic::nvvm_atomic_min_gen_i_sys: 4433 case Intrinsic::nvvm_atomic_or_gen_i_cta: 4434 case Intrinsic::nvvm_atomic_or_gen_i_sys: 4435 case Intrinsic::nvvm_atomic_exch_gen_i_cta: 4436 case Intrinsic::nvvm_atomic_exch_gen_i_sys: 4437 case Intrinsic::nvvm_atomic_xor_gen_i_cta: 4438 case Intrinsic::nvvm_atomic_xor_gen_i_sys: { 4439 auto &DL = I.getModule()->getDataLayout(); 4440 Info.opc = ISD::INTRINSIC_W_CHAIN; 4441 Info.memVT = getValueType(DL, I.getType()); 4442 Info.ptrVal = I.getArgOperand(0); 4443 Info.offset = 0; 4444 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 4445 Info.align.reset(); 4446 return true; 4447 } 4448 4449 case Intrinsic::nvvm_ldu_global_i: 4450 case Intrinsic::nvvm_ldu_global_f: 4451 case Intrinsic::nvvm_ldu_global_p: { 4452 auto &DL = I.getModule()->getDataLayout(); 4453 Info.opc = ISD::INTRINSIC_W_CHAIN; 4454 if (Intrinsic == Intrinsic::nvvm_ldu_global_i) 4455 Info.memVT = getValueType(DL, I.getType()); 4456 else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) 4457 Info.memVT = getPointerTy(DL); 4458 else 4459 Info.memVT = getValueType(DL, I.getType()); 4460 Info.ptrVal 
= I.getArgOperand(0); 4461 Info.offset = 0; 4462 Info.flags = MachineMemOperand::MOLoad; 4463 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 4464 4465 return true; 4466 } 4467 case Intrinsic::nvvm_ldg_global_i: 4468 case Intrinsic::nvvm_ldg_global_f: 4469 case Intrinsic::nvvm_ldg_global_p: { 4470 auto &DL = I.getModule()->getDataLayout(); 4471 4472 Info.opc = ISD::INTRINSIC_W_CHAIN; 4473 if (Intrinsic == Intrinsic::nvvm_ldg_global_i) 4474 Info.memVT = getValueType(DL, I.getType()); 4475 else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) 4476 Info.memVT = getPointerTy(DL); 4477 else 4478 Info.memVT = getValueType(DL, I.getType()); 4479 Info.ptrVal = I.getArgOperand(0); 4480 Info.offset = 0; 4481 Info.flags = MachineMemOperand::MOLoad; 4482 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 4483 4484 return true; 4485 } 4486 4487 case Intrinsic::nvvm_tex_1d_v4f32_s32: 4488 case Intrinsic::nvvm_tex_1d_v4f32_f32: 4489 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 4490 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 4491 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 4492 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 4493 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 4494 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 4495 case Intrinsic::nvvm_tex_2d_v4f32_s32: 4496 case Intrinsic::nvvm_tex_2d_v4f32_f32: 4497 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 4498 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 4499 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 4500 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 4501 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 4502 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 4503 case Intrinsic::nvvm_tex_3d_v4f32_s32: 4504 case Intrinsic::nvvm_tex_3d_v4f32_f32: 4505 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 4506 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 4507 case Intrinsic::nvvm_tex_cube_v4f32_f32: 4508 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 4509 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 4510 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 4511 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 4512 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 4513 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 4514 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 4515 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 4516 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 4517 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 4518 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 4519 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 4520 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 4521 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 4522 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 4523 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 4524 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 4525 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 4526 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 4527 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 4528 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 4529 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 4530 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 4531 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 4532 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 4533 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 4534 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 4535 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 4536 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 4537 case 
Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 4538 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 4539 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 4540 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 4541 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 4542 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 4543 Info.opc = getOpcForTextureInstr(Intrinsic); 4544 Info.memVT = MVT::v4f32; 4545 Info.ptrVal = nullptr; 4546 Info.offset = 0; 4547 Info.flags = MachineMemOperand::MOLoad; 4548 Info.align = Align(16); 4549 return true; 4550 4551 case Intrinsic::nvvm_tex_1d_v4s32_s32: 4552 case Intrinsic::nvvm_tex_1d_v4s32_f32: 4553 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 4554 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 4555 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 4556 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 4557 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 4558 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 4559 case Intrinsic::nvvm_tex_2d_v4s32_s32: 4560 case Intrinsic::nvvm_tex_2d_v4s32_f32: 4561 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 4562 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 4563 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 4564 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 4565 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 4566 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 4567 case Intrinsic::nvvm_tex_3d_v4s32_s32: 4568 case Intrinsic::nvvm_tex_3d_v4s32_f32: 4569 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 4570 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 4571 case Intrinsic::nvvm_tex_cube_v4s32_f32: 4572 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 4573 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 4574 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 4575 case Intrinsic::nvvm_tex_cube_v4u32_f32: 4576 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 4577 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 4578 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 4579 case Intrinsic::nvvm_tex_1d_v4u32_s32: 4580 case Intrinsic::nvvm_tex_1d_v4u32_f32: 4581 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 4582 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 4583 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 4584 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 4585 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 4586 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 4587 case Intrinsic::nvvm_tex_2d_v4u32_s32: 4588 case Intrinsic::nvvm_tex_2d_v4u32_f32: 4589 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 4590 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 4591 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 4592 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 4593 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 4594 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 4595 case Intrinsic::nvvm_tex_3d_v4u32_s32: 4596 case Intrinsic::nvvm_tex_3d_v4u32_f32: 4597 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 4598 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 4599 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 4600 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 4601 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 4602 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 4603 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 4604 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 4605 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 4606 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 4607 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 4608 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 4609 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 4610 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 4611 case 
Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 4612 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 4613 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 4614 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 4615 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 4616 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 4617 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 4618 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 4619 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 4620 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 4621 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 4622 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 4623 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 4624 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 4625 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 4626 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 4627 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 4628 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 4629 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 4630 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 4631 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 4632 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 4633 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 4634 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 4635 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 4636 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 4637 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 4638 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 4639 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 4640 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 4641 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 4642 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 4643 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 4644 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 4645 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 4646 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 4647 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 4648 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 4649 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 4650 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 4651 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 4652 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 4653 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 4654 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 4655 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 4656 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 4657 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 4658 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 4659 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 4660 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 4661 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 4662 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 4663 Info.opc = getOpcForTextureInstr(Intrinsic); 4664 Info.memVT = MVT::v4i32; 4665 Info.ptrVal = nullptr; 4666 Info.offset = 0; 4667 Info.flags = MachineMemOperand::MOLoad; 4668 Info.align = Align(16); 4669 return true; 4670 4671 case Intrinsic::nvvm_suld_1d_i8_clamp: 4672 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 4673 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 4674 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 4675 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 4676 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 4677 case Intrinsic::nvvm_suld_2d_i8_clamp: 4678 case 
Intrinsic::nvvm_suld_2d_v2i8_clamp: 4679 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 4680 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 4681 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 4682 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 4683 case Intrinsic::nvvm_suld_3d_i8_clamp: 4684 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 4685 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 4686 case Intrinsic::nvvm_suld_1d_i8_trap: 4687 case Intrinsic::nvvm_suld_1d_v2i8_trap: 4688 case Intrinsic::nvvm_suld_1d_v4i8_trap: 4689 case Intrinsic::nvvm_suld_1d_array_i8_trap: 4690 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 4691 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 4692 case Intrinsic::nvvm_suld_2d_i8_trap: 4693 case Intrinsic::nvvm_suld_2d_v2i8_trap: 4694 case Intrinsic::nvvm_suld_2d_v4i8_trap: 4695 case Intrinsic::nvvm_suld_2d_array_i8_trap: 4696 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 4697 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 4698 case Intrinsic::nvvm_suld_3d_i8_trap: 4699 case Intrinsic::nvvm_suld_3d_v2i8_trap: 4700 case Intrinsic::nvvm_suld_3d_v4i8_trap: 4701 case Intrinsic::nvvm_suld_1d_i8_zero: 4702 case Intrinsic::nvvm_suld_1d_v2i8_zero: 4703 case Intrinsic::nvvm_suld_1d_v4i8_zero: 4704 case Intrinsic::nvvm_suld_1d_array_i8_zero: 4705 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 4706 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 4707 case Intrinsic::nvvm_suld_2d_i8_zero: 4708 case Intrinsic::nvvm_suld_2d_v2i8_zero: 4709 case Intrinsic::nvvm_suld_2d_v4i8_zero: 4710 case Intrinsic::nvvm_suld_2d_array_i8_zero: 4711 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 4712 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 4713 case Intrinsic::nvvm_suld_3d_i8_zero: 4714 case Intrinsic::nvvm_suld_3d_v2i8_zero: 4715 case Intrinsic::nvvm_suld_3d_v4i8_zero: 4716 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4717 Info.memVT = MVT::i8; 4718 Info.ptrVal = nullptr; 4719 Info.offset = 0; 4720 Info.flags = MachineMemOperand::MOLoad; 4721 Info.align = Align(16); 4722 return true; 4723 4724 case Intrinsic::nvvm_suld_1d_i16_clamp: 4725 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 4726 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 4727 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 4728 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 4729 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 4730 case Intrinsic::nvvm_suld_2d_i16_clamp: 4731 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 4732 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 4733 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 4734 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 4735 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 4736 case Intrinsic::nvvm_suld_3d_i16_clamp: 4737 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 4738 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 4739 case Intrinsic::nvvm_suld_1d_i16_trap: 4740 case Intrinsic::nvvm_suld_1d_v2i16_trap: 4741 case Intrinsic::nvvm_suld_1d_v4i16_trap: 4742 case Intrinsic::nvvm_suld_1d_array_i16_trap: 4743 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 4744 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 4745 case Intrinsic::nvvm_suld_2d_i16_trap: 4746 case Intrinsic::nvvm_suld_2d_v2i16_trap: 4747 case Intrinsic::nvvm_suld_2d_v4i16_trap: 4748 case Intrinsic::nvvm_suld_2d_array_i16_trap: 4749 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 4750 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 4751 case Intrinsic::nvvm_suld_3d_i16_trap: 4752 case Intrinsic::nvvm_suld_3d_v2i16_trap: 4753 case Intrinsic::nvvm_suld_3d_v4i16_trap: 4754 case Intrinsic::nvvm_suld_1d_i16_zero: 4755 case Intrinsic::nvvm_suld_1d_v2i16_zero: 4756 case 
Intrinsic::nvvm_suld_1d_v4i16_zero: 4757 case Intrinsic::nvvm_suld_1d_array_i16_zero: 4758 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 4759 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 4760 case Intrinsic::nvvm_suld_2d_i16_zero: 4761 case Intrinsic::nvvm_suld_2d_v2i16_zero: 4762 case Intrinsic::nvvm_suld_2d_v4i16_zero: 4763 case Intrinsic::nvvm_suld_2d_array_i16_zero: 4764 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 4765 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 4766 case Intrinsic::nvvm_suld_3d_i16_zero: 4767 case Intrinsic::nvvm_suld_3d_v2i16_zero: 4768 case Intrinsic::nvvm_suld_3d_v4i16_zero: 4769 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4770 Info.memVT = MVT::i16; 4771 Info.ptrVal = nullptr; 4772 Info.offset = 0; 4773 Info.flags = MachineMemOperand::MOLoad; 4774 Info.align = Align(16); 4775 return true; 4776 4777 case Intrinsic::nvvm_suld_1d_i32_clamp: 4778 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 4779 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 4780 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 4781 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 4782 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 4783 case Intrinsic::nvvm_suld_2d_i32_clamp: 4784 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 4785 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 4786 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 4787 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 4788 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 4789 case Intrinsic::nvvm_suld_3d_i32_clamp: 4790 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 4791 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 4792 case Intrinsic::nvvm_suld_1d_i32_trap: 4793 case Intrinsic::nvvm_suld_1d_v2i32_trap: 4794 case Intrinsic::nvvm_suld_1d_v4i32_trap: 4795 case Intrinsic::nvvm_suld_1d_array_i32_trap: 4796 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 4797 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 4798 case Intrinsic::nvvm_suld_2d_i32_trap: 4799 case Intrinsic::nvvm_suld_2d_v2i32_trap: 4800 case Intrinsic::nvvm_suld_2d_v4i32_trap: 4801 case Intrinsic::nvvm_suld_2d_array_i32_trap: 4802 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 4803 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 4804 case Intrinsic::nvvm_suld_3d_i32_trap: 4805 case Intrinsic::nvvm_suld_3d_v2i32_trap: 4806 case Intrinsic::nvvm_suld_3d_v4i32_trap: 4807 case Intrinsic::nvvm_suld_1d_i32_zero: 4808 case Intrinsic::nvvm_suld_1d_v2i32_zero: 4809 case Intrinsic::nvvm_suld_1d_v4i32_zero: 4810 case Intrinsic::nvvm_suld_1d_array_i32_zero: 4811 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 4812 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 4813 case Intrinsic::nvvm_suld_2d_i32_zero: 4814 case Intrinsic::nvvm_suld_2d_v2i32_zero: 4815 case Intrinsic::nvvm_suld_2d_v4i32_zero: 4816 case Intrinsic::nvvm_suld_2d_array_i32_zero: 4817 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 4818 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 4819 case Intrinsic::nvvm_suld_3d_i32_zero: 4820 case Intrinsic::nvvm_suld_3d_v2i32_zero: 4821 case Intrinsic::nvvm_suld_3d_v4i32_zero: 4822 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4823 Info.memVT = MVT::i32; 4824 Info.ptrVal = nullptr; 4825 Info.offset = 0; 4826 Info.flags = MachineMemOperand::MOLoad; 4827 Info.align = Align(16); 4828 return true; 4829 4830 case Intrinsic::nvvm_suld_1d_i64_clamp: 4831 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 4832 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 4833 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 4834 case Intrinsic::nvvm_suld_2d_i64_clamp: 4835 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 4836 case 
Intrinsic::nvvm_suld_2d_array_i64_clamp: 4837 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 4838 case Intrinsic::nvvm_suld_3d_i64_clamp: 4839 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 4840 case Intrinsic::nvvm_suld_1d_i64_trap: 4841 case Intrinsic::nvvm_suld_1d_v2i64_trap: 4842 case Intrinsic::nvvm_suld_1d_array_i64_trap: 4843 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 4844 case Intrinsic::nvvm_suld_2d_i64_trap: 4845 case Intrinsic::nvvm_suld_2d_v2i64_trap: 4846 case Intrinsic::nvvm_suld_2d_array_i64_trap: 4847 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 4848 case Intrinsic::nvvm_suld_3d_i64_trap: 4849 case Intrinsic::nvvm_suld_3d_v2i64_trap: 4850 case Intrinsic::nvvm_suld_1d_i64_zero: 4851 case Intrinsic::nvvm_suld_1d_v2i64_zero: 4852 case Intrinsic::nvvm_suld_1d_array_i64_zero: 4853 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 4854 case Intrinsic::nvvm_suld_2d_i64_zero: 4855 case Intrinsic::nvvm_suld_2d_v2i64_zero: 4856 case Intrinsic::nvvm_suld_2d_array_i64_zero: 4857 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 4858 case Intrinsic::nvvm_suld_3d_i64_zero: 4859 case Intrinsic::nvvm_suld_3d_v2i64_zero: 4860 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4861 Info.memVT = MVT::i64; 4862 Info.ptrVal = nullptr; 4863 Info.offset = 0; 4864 Info.flags = MachineMemOperand::MOLoad; 4865 Info.align = Align(16); 4866 return true; 4867 } 4868 return false; 4869 } 4870 4871 /// getFunctionParamOptimizedAlign - since function arguments are passed via 4872 /// .param space, we may want to increase their alignment in a way that 4873 /// ensures that we can effectively vectorize their loads & stores. We can 4874 /// increase alignment only if the function has internal or has private 4875 /// linkage as for other linkage types callers may already rely on default 4876 /// alignment. To allow using 128-bit vectorized loads/stores, this function 4877 /// ensures that alignment is 16 or greater. 4878 Align NVPTXTargetLowering::getFunctionParamOptimizedAlign( 4879 const Function *F, Type *ArgTy, const DataLayout &DL) const { 4880 const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value(); 4881 4882 // If a function has linkage different from internal or private, we 4883 // must use default ABI alignment as external users rely on it. Same 4884 // for a function that may be called from a function pointer. 4885 if (!F || !F->hasLocalLinkage() || 4886 F->hasAddressTaken(/*Users=*/nullptr, 4887 /*IgnoreCallbackUses=*/false, 4888 /*IgnoreAssumeLikeCalls=*/true, 4889 /*IgnoreLLVMUsed=*/true)) 4890 return Align(ABITypeAlign); 4891 4892 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage"); 4893 return Align(std::max(uint64_t(16), ABITypeAlign)); 4894 } 4895 4896 /// Helper for computing alignment of a device function byval parameter. 4897 Align NVPTXTargetLowering::getFunctionByValParamAlign( 4898 const Function *F, Type *ArgTy, Align InitialAlign, 4899 const DataLayout &DL) const { 4900 Align ArgAlign = InitialAlign; 4901 // Try to increase alignment to enhance vectorization options. 4902 if (F) 4903 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL)); 4904 4905 // Old ptx versions have a bug. When PTX code takes address of 4906 // byval parameter with alignment < 4, ptxas generates code to 4907 // spill argument into memory. Alas on sm_50+ ptxas generates 4908 // SASS code that fails with misaligned access. To work around 4909 // the problem, make sure that we align byval parameters by at 4910 // least 4. 
This bug seems to be fixed at least starting from 4911 // ptxas > 9.0. 4912 // TODO: remove this after verifying the bug is not reproduced 4913 // on non-deprecated ptxas versions. 4914 if (ForceMinByValParamAlign) 4915 ArgAlign = std::max(ArgAlign, Align(4)); 4916 4917 return ArgAlign; 4918 } 4919 4920 // Helper for getting a function parameter name. Name is composed from 4921 // its index and the function name. Negative index corresponds to special 4922 // parameter (unsized array) used for passing variable arguments. 4923 std::string NVPTXTargetLowering::getParamName(const Function *F, 4924 int Idx) const { 4925 std::string ParamName; 4926 raw_string_ostream ParamStr(ParamName); 4927 4928 ParamStr << getTargetMachine().getSymbol(F)->getName(); 4929 if (Idx < 0) 4930 ParamStr << "_vararg"; 4931 else 4932 ParamStr << "_param_" << Idx; 4933 4934 return ParamName; 4935 } 4936 4937 /// isLegalAddressingMode - Return true if the addressing mode represented 4938 /// by AM is legal for this target, for a load/store of the specified type. 4939 /// Used to guide target specific optimizations, like loop strength reduction 4940 /// (LoopStrengthReduce.cpp) and memory optimization for address mode 4941 /// (CodeGenPrepare.cpp) 4942 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, 4943 const AddrMode &AM, Type *Ty, 4944 unsigned AS, Instruction *I) const { 4945 // AddrMode - This represents an addressing mode of: 4946 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg 4947 // 4948 // The legal address modes are 4949 // - [avar] 4950 // - [areg] 4951 // - [areg+immoff] 4952 // - [immAddr] 4953 4954 if (AM.BaseGV) { 4955 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale; 4956 } 4957 4958 switch (AM.Scale) { 4959 case 0: // "r", "r+i" or "i" is allowed 4960 break; 4961 case 1: 4962 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed. 4963 return false; 4964 // Otherwise we have r+i. 4965 break; 4966 default: 4967 // No scale > 1 is allowed 4968 return false; 4969 } 4970 return true; 4971 } 4972 4973 //===----------------------------------------------------------------------===// 4974 // NVPTX Inline Assembly Support 4975 //===----------------------------------------------------------------------===// 4976 4977 /// getConstraintType - Given a constraint letter, return the type of 4978 /// constraint it is for this target. 
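/// For reference, the single-letter register-class constraints accepted here
/// map (see getRegForInlineAsmConstraint below) roughly as: 'b' -> predicate
/// (i1) registers, 'c'/'h' -> 16-bit integer registers, 'r' -> 32-bit,
/// 'l'/'N' -> 64-bit integer registers, and 'f'/'d' -> 32-bit/64-bit
/// floating-point registers.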
4979 NVPTXTargetLowering::ConstraintType 4980 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { 4981 if (Constraint.size() == 1) { 4982 switch (Constraint[0]) { 4983 default: 4984 break; 4985 case 'b': 4986 case 'r': 4987 case 'h': 4988 case 'c': 4989 case 'l': 4990 case 'f': 4991 case 'd': 4992 case '0': 4993 case 'N': 4994 return C_RegisterClass; 4995 } 4996 } 4997 return TargetLowering::getConstraintType(Constraint); 4998 } 4999 5000 std::pair<unsigned, const TargetRegisterClass *> 5001 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 5002 StringRef Constraint, 5003 MVT VT) const { 5004 if (Constraint.size() == 1) { 5005 switch (Constraint[0]) { 5006 case 'b': 5007 return std::make_pair(0U, &NVPTX::Int1RegsRegClass); 5008 case 'c': 5009 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 5010 case 'h': 5011 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 5012 case 'r': 5013 return std::make_pair(0U, &NVPTX::Int32RegsRegClass); 5014 case 'l': 5015 case 'N': 5016 return std::make_pair(0U, &NVPTX::Int64RegsRegClass); 5017 case 'f': 5018 return std::make_pair(0U, &NVPTX::Float32RegsRegClass); 5019 case 'd': 5020 return std::make_pair(0U, &NVPTX::Float64RegsRegClass); 5021 } 5022 } 5023 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 5024 } 5025 5026 //===----------------------------------------------------------------------===// 5027 // NVPTX DAG Combining 5028 //===----------------------------------------------------------------------===// 5029 5030 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, 5031 CodeGenOptLevel OptLevel) const { 5032 // Always honor command-line argument 5033 if (FMAContractLevelOpt.getNumOccurrences() > 0) 5034 return FMAContractLevelOpt > 0; 5035 5036 // Do not contract if we're not optimizing the code. 5037 if (OptLevel == CodeGenOptLevel::None) 5038 return false; 5039 5040 // Honor TargetOptions flags that explicitly say fusion is okay. 5041 if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast) 5042 return true; 5043 5044 return allowUnsafeFPMath(MF); 5045 } 5046 5047 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const { 5048 // Honor TargetOptions flags that explicitly say unsafe math is okay. 5049 if (MF.getTarget().Options.UnsafeFPMath) 5050 return true; 5051 5052 // Allow unsafe math if unsafe-fp-math attribute explicitly says so. 5053 const Function &F = MF.getFunction(); 5054 return F.getFnAttribute("unsafe-fp-math").getValueAsBool(); 5055 } 5056 5057 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 5058 /// operands N0 and N1. This is a helper for PerformADDCombine that is 5059 /// called with the default operands, and if that fails, with commuted 5060 /// operands. 5061 static SDValue PerformADDCombineWithOperands( 5062 SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, 5063 const NVPTXSubtarget &Subtarget, CodeGenOptLevel OptLevel) { 5064 SelectionDAG &DAG = DCI.DAG; 5065 // Skip non-integer, non-scalar case 5066 EVT VT=N0.getValueType(); 5067 if (VT.isVector()) 5068 return SDValue(); 5069 5070 // fold (add (mul a, b), c) -> (mad a, b, c) 5071 // 5072 if (N0.getOpcode() == ISD::MUL) { 5073 assert (VT.isInteger()); 5074 // For integer: 5075 // Since integer multiply-add costs the same as integer multiply 5076 // but is more costly than integer add, do the fusion only when 5077 // the mul is only used in the add. 
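    // Illustrative sketch only (names and shapes assumed, not taken from a
    // real module):
    //   t0 = mul i32 %a, %b        ; t0 has a single use
    //   t1 = add i32 t0, %c
    // is rewritten into one NVPTXISD::IMAD node, which selects to a PTX
    // mad.lo.s32 instead of a separate mul.lo.s32 followed by add.s32.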
    if (OptLevel == CodeGenOptLevel::None || VT != MVT::i32 ||
        !N0.getNode()->hasOneUse())
      return SDValue();

    // Do the folding
    return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
                       N0.getOperand(0), N0.getOperand(1), N1);
  } else if (N0.getOpcode() == ISD::FMUL) {
    if (VT == MVT::f32 || VT == MVT::f64) {
      const auto *TLI = static_cast<const NVPTXTargetLowering *>(
          &DAG.getTargetLoweringInfo());
      if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
        return SDValue();

      // For floating point:
      // Do the fusion only when the mul has fewer than 5 uses and all of
      // them are adds.
      // The heuristic is that if a use is not an add, then that use cannot
      // be fused into an fma, so the mul is still needed anyway.
      // If there are more than 4 uses, even if they are all adds, fusing
      // them will increase register pressure.
      //
      int numUses = 0;
      int nonAddCount = 0;
      for (const SDNode *User : N0.getNode()->uses()) {
        numUses++;
        if (User->getOpcode() != ISD::FADD)
          ++nonAddCount;
      }
      if (numUses >= 5)
        return SDValue();
      if (nonAddCount) {
        int orderNo = N->getIROrder();
        int orderNo2 = N0.getNode()->getIROrder();
        // Simple heuristic for estimating register pressure: the difference
        // in IR order approximates the distance between the def and this
        // use, and a longer distance is more likely to cause register
        // pressure.
        if (orderNo - orderNo2 < 500)
          return SDValue();

        // Now, check if at least one of the FMUL's operands is live beyond
        // node N, which guarantees that the FMA will not increase register
        // pressure at node N.
        bool opIsLive = false;
        const SDNode *left = N0.getOperand(0).getNode();
        const SDNode *right = N0.getOperand(1).getNode();

        if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
          opIsLive = true;

        if (!opIsLive)
          for (const SDNode *User : left->uses()) {
            int orderNo3 = User->getIROrder();
            if (orderNo3 > orderNo) {
              opIsLive = true;
              break;
            }
          }

        if (!opIsLive)
          for (const SDNode *User : right->uses()) {
            int orderNo3 = User->getIROrder();
            if (orderNo3 > orderNo) {
              opIsLive = true;
              break;
            }
          }

        if (!opIsLive)
          return SDValue();
      }

      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
                         N0.getOperand(0), N0.getOperand(1), N1);
    }
  }

  return SDValue();
}

static SDValue PerformStoreRetvalCombine(SDNode *N) {
  // Operands from the 2nd to the last one are the values to be stored.
  for (std::size_t I = 2, OpsCount = N->ops().size(); I != OpsCount; ++I)
    if (!N->getOperand(I).isUndef())
      return SDValue();

  // Operand 0 is the previous value in the chain. Cannot return EntryToken
  // as the previous value will become unused and eliminated later.
  return N->getOperand(0);
}

/// PerformADDCombine - Target-specific DAG combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const NVPTXSubtarget &Subtarget,
                                 CodeGenOptLevel OptLevel) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // First try with the default operand order.
5180 if (SDValue Result = 5181 PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel)) 5182 return Result; 5183 5184 // If that didn't work, try again with the operands commuted. 5185 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel); 5186 } 5187 5188 static SDValue PerformANDCombine(SDNode *N, 5189 TargetLowering::DAGCombinerInfo &DCI) { 5190 // The type legalizer turns a vector load of i8 values into a zextload to i16 5191 // registers, optionally ANY_EXTENDs it (if target type is integer), 5192 // and ANDs off the high 8 bits. Since we turn this load into a 5193 // target-specific DAG node, the DAG combiner fails to eliminate these AND 5194 // nodes. Do that here. 5195 SDValue Val = N->getOperand(0); 5196 SDValue Mask = N->getOperand(1); 5197 5198 if (isa<ConstantSDNode>(Val)) { 5199 std::swap(Val, Mask); 5200 } 5201 5202 SDValue AExt; 5203 5204 // Convert BFE-> truncate i16 -> and 255 5205 // To just BFE-> truncate i16, as the value already has all the bits in the 5206 // right places. 5207 if (Val.getOpcode() == ISD::TRUNCATE) { 5208 SDValue BFE = Val.getOperand(0); 5209 if (BFE.getOpcode() != NVPTXISD::BFE) 5210 return SDValue(); 5211 5212 ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0)); 5213 if (!BFEBits) 5214 return SDValue(); 5215 uint64_t BFEBitsVal = BFEBits->getZExtValue(); 5216 5217 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); 5218 if (!MaskCnst) { 5219 // Not an AND with a constant 5220 return SDValue(); 5221 } 5222 uint64_t MaskVal = MaskCnst->getZExtValue(); 5223 5224 if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1) 5225 return SDValue(); 5226 // If we get here, the AND is unnecessary. Just replace it with the trunc 5227 DCI.CombineTo(N, Val, false); 5228 } 5229 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and 5230 if (Val.getOpcode() == ISD::ANY_EXTEND) { 5231 AExt = Val; 5232 Val = Val->getOperand(0); 5233 } 5234 5235 if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { 5236 Val = Val->getOperand(0); 5237 } 5238 5239 if (Val->getOpcode() == NVPTXISD::LoadV2 || 5240 Val->getOpcode() == NVPTXISD::LoadV4) { 5241 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); 5242 if (!MaskCnst) { 5243 // Not an AND with a constant 5244 return SDValue(); 5245 } 5246 5247 uint64_t MaskVal = MaskCnst->getZExtValue(); 5248 if (MaskVal != 0xff) { 5249 // Not an AND that chops off top 8 bits 5250 return SDValue(); 5251 } 5252 5253 MemSDNode *Mem = dyn_cast<MemSDNode>(Val); 5254 if (!Mem) { 5255 // Not a MemSDNode?!? 5256 return SDValue(); 5257 } 5258 5259 EVT MemVT = Mem->getMemoryVT(); 5260 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { 5261 // We only handle the i8 case 5262 return SDValue(); 5263 } 5264 5265 unsigned ExtType = 5266 cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))-> 5267 getZExtValue(); 5268 if (ExtType == ISD::SEXTLOAD) { 5269 // If for some reason the load is a sextload, the and is needed to zero 5270 // out the high 8 bits 5271 return SDValue(); 5272 } 5273 5274 bool AddTo = false; 5275 if (AExt.getNode() != nullptr) { 5276 // Re-insert the ext as a zext. 5277 Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), 5278 AExt.getValueType(), Val); 5279 AddTo = true; 5280 } 5281 5282 // If we get here, the AND is unnecessary. 
Just replace it with the load 5283 DCI.CombineTo(N, Val, AddTo); 5284 } 5285 5286 return SDValue(); 5287 } 5288 5289 static SDValue PerformREMCombine(SDNode *N, 5290 TargetLowering::DAGCombinerInfo &DCI, 5291 CodeGenOptLevel OptLevel) { 5292 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM); 5293 5294 // Don't do anything at less than -O2. 5295 if (OptLevel < CodeGenOptLevel::Default) 5296 return SDValue(); 5297 5298 SelectionDAG &DAG = DCI.DAG; 5299 SDLoc DL(N); 5300 EVT VT = N->getValueType(0); 5301 bool IsSigned = N->getOpcode() == ISD::SREM; 5302 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV; 5303 5304 const SDValue &Num = N->getOperand(0); 5305 const SDValue &Den = N->getOperand(1); 5306 5307 for (const SDNode *U : Num->uses()) { 5308 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num && 5309 U->getOperand(1) == Den) { 5310 // Num % Den -> Num - (Num / Den) * Den 5311 return DAG.getNode(ISD::SUB, DL, VT, Num, 5312 DAG.getNode(ISD::MUL, DL, VT, 5313 DAG.getNode(DivOpc, DL, VT, Num, Den), 5314 Den)); 5315 } 5316 } 5317 return SDValue(); 5318 } 5319 5320 enum OperandSignedness { 5321 Signed = 0, 5322 Unsigned, 5323 Unknown 5324 }; 5325 5326 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand 5327 /// that can be demoted to \p OptSize bits without loss of information. The 5328 /// signedness of the operand, if determinable, is placed in \p S. 5329 static bool IsMulWideOperandDemotable(SDValue Op, 5330 unsigned OptSize, 5331 OperandSignedness &S) { 5332 S = Unknown; 5333 5334 if (Op.getOpcode() == ISD::SIGN_EXTEND || 5335 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { 5336 EVT OrigVT = Op.getOperand(0).getValueType(); 5337 if (OrigVT.getFixedSizeInBits() <= OptSize) { 5338 S = Signed; 5339 return true; 5340 } 5341 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { 5342 EVT OrigVT = Op.getOperand(0).getValueType(); 5343 if (OrigVT.getFixedSizeInBits() <= OptSize) { 5344 S = Unsigned; 5345 return true; 5346 } 5347 } 5348 5349 return false; 5350 } 5351 5352 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can 5353 /// be demoted to \p OptSize bits without loss of information. If the operands 5354 /// contain a constant, it should appear as the RHS operand. The signedness of 5355 /// the operands is placed in \p IsSigned. 5356 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, 5357 unsigned OptSize, 5358 bool &IsSigned) { 5359 OperandSignedness LHSSign; 5360 5361 // The LHS operand must be a demotable op 5362 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign)) 5363 return false; 5364 5365 // We should have been able to determine the signedness from the LHS 5366 if (LHSSign == Unknown) 5367 return false; 5368 5369 IsSigned = (LHSSign == Signed); 5370 5371 // The RHS can be a demotable op or a constant 5372 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { 5373 const APInt &Val = CI->getAPIntValue(); 5374 if (LHSSign == Unsigned) { 5375 return Val.isIntN(OptSize); 5376 } else { 5377 return Val.isSignedIntN(OptSize); 5378 } 5379 } else { 5380 OperandSignedness RHSSign; 5381 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) 5382 return false; 5383 5384 return LHSSign == RHSSign; 5385 } 5386 } 5387 5388 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply 5389 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform 5390 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift 5391 /// amount. 
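/// For illustration only (operand shapes assumed):
///   (mul i32 (sext i16 %a), (sext i16 %b)) --> MUL_WIDE_SIGNED   %a, %b
///   (shl i64 (zext i32 %x), 4)             --> MUL_WIDE_UNSIGNED %x, 16
/// i.e. the operands are truncated back to the narrow type and the multiply
/// is emitted as a PTX mul.wide instruction, which produces the full-width
/// result directly.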
5392 static SDValue TryMULWIDECombine(SDNode *N, 5393 TargetLowering::DAGCombinerInfo &DCI) { 5394 EVT MulType = N->getValueType(0); 5395 if (MulType != MVT::i32 && MulType != MVT::i64) { 5396 return SDValue(); 5397 } 5398 5399 SDLoc DL(N); 5400 unsigned OptSize = MulType.getSizeInBits() >> 1; 5401 SDValue LHS = N->getOperand(0); 5402 SDValue RHS = N->getOperand(1); 5403 5404 // Canonicalize the multiply so the constant (if any) is on the right 5405 if (N->getOpcode() == ISD::MUL) { 5406 if (isa<ConstantSDNode>(LHS)) { 5407 std::swap(LHS, RHS); 5408 } 5409 } 5410 5411 // If we have a SHL, determine the actual multiply amount 5412 if (N->getOpcode() == ISD::SHL) { 5413 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS); 5414 if (!ShlRHS) { 5415 return SDValue(); 5416 } 5417 5418 APInt ShiftAmt = ShlRHS->getAPIntValue(); 5419 unsigned BitWidth = MulType.getSizeInBits(); 5420 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) { 5421 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; 5422 RHS = DCI.DAG.getConstant(MulVal, DL, MulType); 5423 } else { 5424 return SDValue(); 5425 } 5426 } 5427 5428 bool Signed; 5429 // Verify that our operands are demotable 5430 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) { 5431 return SDValue(); 5432 } 5433 5434 EVT DemotedVT; 5435 if (MulType == MVT::i32) { 5436 DemotedVT = MVT::i16; 5437 } else { 5438 DemotedVT = MVT::i32; 5439 } 5440 5441 // Truncate the operands to the correct size. Note that these are just for 5442 // type consistency and will (likely) be eliminated in later phases. 5443 SDValue TruncLHS = 5444 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS); 5445 SDValue TruncRHS = 5446 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS); 5447 5448 unsigned Opc; 5449 if (Signed) { 5450 Opc = NVPTXISD::MUL_WIDE_SIGNED; 5451 } else { 5452 Opc = NVPTXISD::MUL_WIDE_UNSIGNED; 5453 } 5454 5455 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS); 5456 } 5457 5458 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes. 5459 static SDValue PerformMULCombine(SDNode *N, 5460 TargetLowering::DAGCombinerInfo &DCI, 5461 CodeGenOptLevel OptLevel) { 5462 if (OptLevel > CodeGenOptLevel::None) { 5463 // Try mul.wide combining at OptLevel > 0 5464 if (SDValue Ret = TryMULWIDECombine(N, DCI)) 5465 return Ret; 5466 } 5467 5468 return SDValue(); 5469 } 5470 5471 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes. 5472 static SDValue PerformSHLCombine(SDNode *N, 5473 TargetLowering::DAGCombinerInfo &DCI, 5474 CodeGenOptLevel OptLevel) { 5475 if (OptLevel > CodeGenOptLevel::None) { 5476 // Try mul.wide combining at OptLevel > 0 5477 if (SDValue Ret = TryMULWIDECombine(N, DCI)) 5478 return Ret; 5479 } 5480 5481 return SDValue(); 5482 } 5483 5484 static SDValue PerformSETCCCombine(SDNode *N, 5485 TargetLowering::DAGCombinerInfo &DCI, 5486 unsigned int SmVersion) { 5487 EVT CCType = N->getValueType(0); 5488 SDValue A = N->getOperand(0); 5489 SDValue B = N->getOperand(1); 5490 5491 EVT AType = A.getValueType(); 5492 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16))) 5493 return SDValue(); 5494 5495 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90) 5496 return SDValue(); 5497 5498 SDLoc DL(N); 5499 // setp.f16x2 returns two scalar predicates, which we need to 5500 // convert back to v2i1. The returned result will be scalarized by 5501 // the legalizer, but the comparison will remain a single vector 5502 // instruction. 
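  // Sketch of the rewrite (node shapes assumed for illustration):
  //   (setcc v2i1 %a, %b, setolt), with %a/%b of type v2f16
  //     -> (SETP_F16X2 %a, %b, setolt)        ; yields two i1 results
  //     -> (build_vector v2i1 res0, res1)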
  SDValue CCNode = DCI.DAG.getNode(
      A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
                                     : NVPTXISD::SETP_BF16X2,
      DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
                         CCNode.getValue(1));
}

static SDValue PerformEXTRACTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Vector = N->getOperand(0);
  SDLoc DL(N);
  EVT VectorVT = Vector.getValueType();
  if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
      IsPTXVectorType(VectorVT.getSimpleVT()))
    return SDValue(); // Native vector loads already combine nicely with
                      // extract_vector_elt, except for v4i8.
  // Don't mess with singletons or v2*16 types; we already handle them OK.
  if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
      VectorVT == MVT::v4i8)
    return SDValue();

  uint64_t VectorBits = VectorVT.getSizeInBits();
  // We only handle the types we can extract in-register.
  if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
    return SDValue();

  ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
  // Index == 0 is handled by the generic DAG combiner.
  if (!Index || Index->getZExtValue() == 0)
    return SDValue();

  MVT IVT = MVT::getIntegerVT(VectorBits);
  EVT EltVT = VectorVT.getVectorElementType();
  EVT EltIVT = EltVT.changeTypeToInteger();
  uint64_t EltBits = EltVT.getScalarSizeInBits();

  SDValue Result = DCI.DAG.getNode(
      ISD::TRUNCATE, DL, EltIVT,
      DCI.DAG.getNode(
          ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
          DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));

  // If the element has a non-integer type, bitcast it back to the expected
  // type.
  if (EltVT != EltIVT)
    Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
  // Past the legalizer, we may need to extend i8 -> i16 to match the
  // register type.
  if (EltVT != N->getValueType(0))
    Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);

  return Result;
}

static SDValue PerformVSELECTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  SDValue VA = N->getOperand(1);
  EVT VectorVT = VA.getValueType();
  if (VectorVT != MVT::v4i8)
    return SDValue();

  // We need to split the vselect into individual per-element operations.
  // Because we use BFE/BFI instructions for byte extraction/insertion, we
  // end up with 32-bit values anyway, so we may as well do the comparison
  // as i32 to avoid the conversions to/from i16 normally used for i8 values.
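  // Per-lane expansion, roughly (illustrative only):
  //   for each lane I in 0..3:
  //     c  = extract_vector_elt i1 %cond, I
  //     ea = any_extend to i32 (extract_vector_elt i8 %a, I)
  //     eb = any_extend to i32 (extract_vector_elt i8 %b, I)
  //     eI = truncate to i8 (select c, ea, eb)
  //   result = build_vector v4i8 (e0, e1, e2, e3)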
5567 SmallVector<SDValue, 4> E; 5568 SDLoc DL(N); 5569 SDValue VCond = N->getOperand(0); 5570 SDValue VB = N->getOperand(2); 5571 for (int I = 0; I < 4; ++I) { 5572 SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond, 5573 DCI.DAG.getConstant(I, DL, MVT::i32)); 5574 SDValue EA = DCI.DAG.getAnyExtOrTrunc( 5575 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA, 5576 DCI.DAG.getConstant(I, DL, MVT::i32)), 5577 DL, MVT::i32); 5578 SDValue EB = DCI.DAG.getAnyExtOrTrunc( 5579 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB, 5580 DCI.DAG.getConstant(I, DL, MVT::i32)), 5581 DL, MVT::i32); 5582 E.push_back(DCI.DAG.getAnyExtOrTrunc( 5583 DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8)); 5584 } 5585 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E); 5586 } 5587 5588 static SDValue PerformLOADCombine(SDNode *N, 5589 TargetLowering::DAGCombinerInfo &DCI) { 5590 SelectionDAG &DAG = DCI.DAG; 5591 LoadSDNode *LD = cast<LoadSDNode>(N); 5592 5593 // Lower a v16i8 load into a LoadV4 operation with i32 results instead of 5594 // letting ReplaceLoadVector split it into smaller loads during legalization. 5595 // This is done at dag-combine1 time, so that vector operations with i8 5596 // elements can be optimised away instead of being needlessly split during 5597 // legalization, which involves storing to the stack and loading it back. 5598 EVT VT = N->getValueType(0); 5599 if (VT != MVT::v16i8) 5600 return SDValue(); 5601 5602 SDLoc DL(N); 5603 5604 // Create a v4i32 vector load operation, effectively <4 x v4i8>. 5605 unsigned Opc = NVPTXISD::LoadV4; 5606 EVT NewVT = MVT::v4i32; 5607 EVT EltVT = NewVT.getVectorElementType(); 5608 unsigned NumElts = NewVT.getVectorNumElements(); 5609 EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other}; 5610 SDVTList RetVTList = DAG.getVTList(RetVTs); 5611 SmallVector<SDValue, 8> Ops(N->ops()); 5612 Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL)); 5613 SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT, 5614 LD->getMemOperand()); 5615 SDValue NewChain = NewLoad.getValue(NumElts); 5616 5617 // Create a vector of the same type returned by the original load. 
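  // Roughly (illustrative sketch): the four i32 results of the LoadV4 node
  // are packed into a v4i32 build_vector, bitcast back to the original
  // v16i8 type, and merged with the new chain so users of the old load see
  // the same (value, chain) pair.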
  SmallVector<SDValue, 4> Elts;
  for (unsigned i = 0; i < NumElts; i++)
    Elts.push_back(NewLoad.getValue(i));
  return DCI.DAG.getMergeValues(
      {DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
       NewChain},
      DL);
}

SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
  switch (N->getOpcode()) {
  default:
    break;
  case ISD::ADD:
  case ISD::FADD:
    return PerformADDCombine(N, DCI, STI, OptLevel);
  case ISD::MUL:
    return PerformMULCombine(N, DCI, OptLevel);
  case ISD::SHL:
    return PerformSHLCombine(N, DCI, OptLevel);
  case ISD::AND:
    return PerformANDCombine(N, DCI);
  case ISD::UREM:
  case ISD::SREM:
    return PerformREMCombine(N, DCI, OptLevel);
  case ISD::SETCC:
    return PerformSETCCCombine(N, DCI, STI.getSmVersion());
  case ISD::LOAD:
    return PerformLOADCombine(N, DCI);
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    return PerformStoreRetvalCombine(N);
  case ISD::EXTRACT_VECTOR_ELT:
    return PerformEXTRACTCombine(N, DCI);
  case ISD::VSELECT:
    return PerformVSELECTCombine(N, DCI);
  }
  return SDValue();
}

/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                              SmallVectorImpl<SDValue> &Results) {
  EVT ResVT = N->getValueType(0);
  SDLoc DL(N);

  assert(ResVT.isVector() && "Vector load must have vector type");

  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
  // legal. We can (and should) split that into 2 loads of <2 x double> here
  // but I'm leaving that as a TODO for now.
  assert(ResVT.isSimple() && "Can only handle simple types");
  switch (ResVT.getSimpleVT().SimpleTy) {
  default:
    return;
  case MVT::v2i8:
  case MVT::v2i16:
  case MVT::v2i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v2f32:
  case MVT::v2f64:
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v4i32:
  case MVT::v4f16:
  case MVT::v4f32:
  case MVT::v8f16:  // <4 x f16x2>
  case MVT::v8bf16: // <4 x bf16x2>
  case MVT::v8i16:  // <4 x i16x2>
    // This is a "native" vector type.
    break;
  }

  LoadSDNode *LD = cast<LoadSDNode>(N);

  Align Alignment = LD->getAlign();
  auto &TD = DAG.getDataLayout();
  Align PrefAlign =
      TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
  if (Alignment < PrefAlign) {
    // This load is not sufficiently aligned, so bail out and let this vector
    // load be scalarized. Note that we may still be able to emit smaller
    // vector loads. For example, if we are loading a <4 x float> with an
    // alignment of 8, this check will fail but the legalizer will try again
    // with 2 x <2 x float>, which will succeed with an alignment of 8.
    return;
  }

  EVT EltVT = ResVT.getVectorElementType();
  unsigned NumElts = ResVT.getVectorNumElements();

  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
  // loaded type to i16 and propagate the "real" type as the memory type.
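  // For example (sketch, not tied to a particular module): a load of
  // <2 x i8> becomes a LoadV2 node returning two i16 values with v2i8
  // recorded as the memory VT; the extra high bits are truncated away again
  // when the final build_vector is created below.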
  bool NeedTrunc = false;
  if (EltVT.getSizeInBits() < 16) {
    EltVT = MVT::i16;
    NeedTrunc = true;
  }

  unsigned Opcode = 0;
  SDVTList LdResVTs;
  bool Load16x2 = false;

  switch (NumElts) {
  default:
    return;
  case 2:
    Opcode = NVPTXISD::LoadV2;
    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
    break;
  case 4: {
    Opcode = NVPTXISD::LoadV4;
    EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  case 8: {
    // v8f16 is a special case. PTX doesn't have an ld.v8.f16
    // instruction. Instead, we split the vector into v2f16 chunks and
    // load them with ld.v4.b32.
    assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
    Load16x2 = true;
    Opcode = NVPTXISD::LoadV4;
    EVT VVT;
    switch (EltVT.getSimpleVT().SimpleTy) {
    case MVT::f16:
      VVT = MVT::v2f16;
      break;
    case MVT::bf16:
      VVT = MVT::v2bf16;
      break;
    case MVT::i16:
      VVT = MVT::v2i16;
      break;
    default:
      llvm_unreachable("Unsupported v8 vector type.");
    }
    EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  }

  // Copy regular operands
  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());

  // The select routine does not have access to the LoadSDNode instance, so
  // pass along the extension information
  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));

  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                          LD->getMemoryVT(),
                                          LD->getMemOperand());

  SmallVector<SDValue, 8> ScalarRes;
  if (Load16x2) {
    // Split v2f16 subvectors back into individual elements.
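    // Each of the four loaded values is a 2-element vector (f16x2, bf16x2, or
    // i16x2), so halve the element count and extract both lanes from every
    // subvector to recover the scalars of the original v8 result type.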
    NumElts /= 2;
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue SubVector = NewLD.getValue(i);
      SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(0, DL));
      SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(1, DL));
      ScalarRes.push_back(E0);
      ScalarRes.push_back(E1);
    }
  } else {
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue Res = NewLD.getValue(i);
      if (NeedTrunc)
        Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
      ScalarRes.push_back(Res);
    }
  }

  SDValue LoadChain = NewLD.getValue(NumElts);

  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);

  Results.push_back(BuildVec);
  Results.push_back(LoadChain);
}

static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &Results) {
  SDValue Chain = N->getOperand(0);
  SDValue Intrin = N->getOperand(1);
  SDLoc DL(N);

  // Get the intrinsic ID
  unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
  switch (IntrinNo) {
  default:
    return;
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    EVT ResVT = N->getValueType(0);

    if (ResVT.isVector()) {
      // Vector LDG/LDU

      unsigned NumElts = ResVT.getVectorNumElements();
      EVT EltVT = ResVT.getVectorElementType();

      // Since LDU/LDG are target nodes, we cannot rely on DAG type
      // legalization. Therefore, we must ensure the type is legal. For i1 and
      // i8, we set the loaded type to i16 and propagate the "real" type as
      // the memory type.
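      // As in ReplaceLoadVector above, widen sub-16-bit element types to i16
      // for the target load node and truncate each loaded value back to the
      // original element type afterwards.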
      bool NeedTrunc = false;
      if (EltVT.getSizeInBits() < 16) {
        EltVT = MVT::i16;
        NeedTrunc = true;
      }

      unsigned Opcode = 0;
      SDVTList LdResVTs;

      switch (NumElts) {
      default:
        return;
      case 2:
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV2;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV2;
          break;
        }
        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
        break;
      case 4: {
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV4;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV4;
          break;
        }
        EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
        LdResVTs = DAG.getVTList(ListVTs);
        break;
      }
      }

      SmallVector<SDValue, 8> OtherOps;

      // Copy regular operands

      OtherOps.push_back(Chain); // Chain
      // Skip operand 1 (intrinsic ID)
      // Others
      OtherOps.append(N->op_begin() + 2, N->op_end());

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                              MemSD->getMemoryVT(),
                                              MemSD->getMemOperand());

      SmallVector<SDValue, 4> ScalarRes;

      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue Res = NewLD.getValue(i);
        if (NeedTrunc)
          Res =
              DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
        ScalarRes.push_back(Res);
      }

      SDValue LoadChain = NewLD.getValue(NumElts);

      SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);

      Results.push_back(BuildVec);
      Results.push_back(LoadChain);
    } else {
      // i8 LDG/LDU
      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
             "Custom handling of non-i8 ldu/ldg?");

      // Just copy all operands as-is
      SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());

      // Force output to i16
      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      // We make sure the memory type is i8, which will be used during isel
      // to select the proper instruction.
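      // The rebuilt intrinsic load produces an i16 value; it is truncated
      // back to i8 below before the result and the chain are handed back to
      // the caller.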
      SDValue NewLD =
          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
                                  MVT::i8, MemSD->getMemOperand());

      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                    NewLD.getValue(0)));
      Results.push_back(NewLD.getValue(1));
    }
  }
  }
}

void NVPTXTargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    report_fatal_error("Unhandled custom legalization");
  case ISD::LOAD:
    ReplaceLoadVector(N, DAG, Results);
    return;
  case ISD::INTRINSIC_W_CHAIN:
    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
    return;
  }
}

NVPTXTargetLowering::AtomicExpansionKind
NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  Type *Ty = AI->getValOperand()->getType();

  if (AI->isFloatingPointOperation()) {
    if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
      if (Ty->isFloatTy())
        return AtomicExpansionKind::None;
      if (Ty->isDoubleTy() && STI.hasAtomAddF64())
        return AtomicExpansionKind::None;
    }
    return AtomicExpansionKind::CmpXChg;
  }

  assert(Ty->isIntegerTy() && "Ty should be integer at this point");
  auto ITy = cast<llvm::IntegerType>(Ty);

  switch (AI->getOperation()) {
  default:
    return AtomicExpansionKind::CmpXChg;
  case AtomicRMWInst::BinOp::And:
  case AtomicRMWInst::BinOp::Or:
  case AtomicRMWInst::BinOp::Xor:
  case AtomicRMWInst::BinOp::Xchg:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomBitwise64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  case AtomicRMWInst::BinOp::Add:
  case AtomicRMWInst::BinOp::Sub:
  case AtomicRMWInst::BinOp::Max:
  case AtomicRMWInst::BinOp::Min:
  case AtomicRMWInst::BinOp::UMax:
  case AtomicRMWInst::BinOp::UMin:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomMinMax64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  }

  return AtomicExpansionKind::CmpXChg;
}

// Pin NVPTXTargetObjectFile's vtables to this file.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;

MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
  return getDataSection();
}