1 //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines the interfaces that NVPTX uses to lower LLVM code into a 10 // selection DAG. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "NVPTXISelLowering.h" 15 #include "MCTargetDesc/NVPTXBaseInfo.h" 16 #include "NVPTX.h" 17 #include "NVPTXSubtarget.h" 18 #include "NVPTXTargetMachine.h" 19 #include "NVPTXTargetObjectFile.h" 20 #include "NVPTXUtilities.h" 21 #include "llvm/ADT/APInt.h" 22 #include "llvm/ADT/STLExtras.h" 23 #include "llvm/ADT/SmallVector.h" 24 #include "llvm/ADT/StringRef.h" 25 #include "llvm/CodeGen/Analysis.h" 26 #include "llvm/CodeGen/ISDOpcodes.h" 27 #include "llvm/CodeGen/MachineFunction.h" 28 #include "llvm/CodeGen/MachineMemOperand.h" 29 #include "llvm/CodeGen/MachineValueType.h" 30 #include "llvm/CodeGen/SelectionDAG.h" 31 #include "llvm/CodeGen/SelectionDAGNodes.h" 32 #include "llvm/CodeGen/TargetCallingConv.h" 33 #include "llvm/CodeGen/TargetLowering.h" 34 #include "llvm/CodeGen/ValueTypes.h" 35 #include "llvm/IR/Argument.h" 36 #include "llvm/IR/Attributes.h" 37 #include "llvm/IR/Constants.h" 38 #include "llvm/IR/DataLayout.h" 39 #include "llvm/IR/DerivedTypes.h" 40 #include "llvm/IR/DiagnosticInfo.h" 41 #include "llvm/IR/FPEnv.h" 42 #include "llvm/IR/Function.h" 43 #include "llvm/IR/GlobalValue.h" 44 #include "llvm/IR/Instruction.h" 45 #include "llvm/IR/Instructions.h" 46 #include "llvm/IR/IntrinsicsNVPTX.h" 47 #include "llvm/IR/Module.h" 48 #include "llvm/IR/Type.h" 49 #include "llvm/IR/Value.h" 50 #include "llvm/Support/Casting.h" 51 #include "llvm/Support/CodeGen.h" 52 #include "llvm/Support/CommandLine.h" 53 #include "llvm/Support/ErrorHandling.h" 54 #include "llvm/Support/raw_ostream.h" 55 #include "llvm/Target/TargetMachine.h" 56 #include "llvm/Target/TargetOptions.h" 57 #include <algorithm> 58 #include <cassert> 59 #include <cmath> 60 #include <cstdint> 61 #include <iterator> 62 #include <sstream> 63 #include <string> 64 #include <utility> 65 #include <vector> 66 67 #define DEBUG_TYPE "nvptx-lower" 68 69 using namespace llvm; 70 71 static std::atomic<unsigned> GlobalUniqueCallSite; 72 73 static cl::opt<bool> sched4reg( 74 "nvptx-sched4reg", 75 cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false)); 76 77 static cl::opt<unsigned> FMAContractLevelOpt( 78 "nvptx-fma-level", cl::Hidden, 79 cl::desc("NVPTX Specific: FMA contraction (0: don't do it" 80 " 1: do it 2: do it aggressively"), 81 cl::init(2)); 82 83 static cl::opt<int> UsePrecDivF32( 84 "nvptx-prec-divf32", cl::Hidden, 85 cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use" 86 " IEEE Compliant F32 div.rnd if available."), 87 cl::init(2)); 88 89 static cl::opt<bool> UsePrecSqrtF32( 90 "nvptx-prec-sqrtf32", cl::Hidden, 91 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), 92 cl::init(true)); 93 94 static cl::opt<bool> ForceMinByValParamAlign( 95 "nvptx-force-min-byval-param-align", cl::Hidden, 96 cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" 97 " params of device functions."), 98 cl::init(false)); 99 100 int NVPTXTargetLowering::getDivF32Level() const { 101 if 
(UsePrecDivF32.getNumOccurrences() > 0) { 102 // If nvptx-prec-div32=N is used on the command-line, always honor it 103 return UsePrecDivF32; 104 } else { 105 // Otherwise, use div.approx if fast math is enabled 106 if (getTargetMachine().Options.UnsafeFPMath) 107 return 0; 108 else 109 return 2; 110 } 111 } 112 113 bool NVPTXTargetLowering::usePrecSqrtF32() const { 114 if (UsePrecSqrtF32.getNumOccurrences() > 0) { 115 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it 116 return UsePrecSqrtF32; 117 } else { 118 // Otherwise, use sqrt.approx if fast math is enabled 119 return !getTargetMachine().Options.UnsafeFPMath; 120 } 121 } 122 123 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const { 124 return MF.getDenormalMode(APFloat::IEEEsingle()).Output == 125 DenormalMode::PreserveSign; 126 } 127 128 static bool IsPTXVectorType(MVT VT) { 129 switch (VT.SimpleTy) { 130 default: 131 return false; 132 case MVT::v2i1: 133 case MVT::v4i1: 134 case MVT::v2i8: 135 case MVT::v4i8: 136 case MVT::v2i16: 137 case MVT::v4i16: 138 case MVT::v8i16: // <4 x i16x2> 139 case MVT::v2i32: 140 case MVT::v4i32: 141 case MVT::v2i64: 142 case MVT::v2f16: 143 case MVT::v4f16: 144 case MVT::v8f16: // <4 x f16x2> 145 case MVT::v2bf16: 146 case MVT::v4bf16: 147 case MVT::v8bf16: // <4 x bf16x2> 148 case MVT::v2f32: 149 case MVT::v4f32: 150 case MVT::v2f64: 151 return true; 152 } 153 } 154 155 static bool Is16bitsType(MVT VT) { 156 return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 || 157 VT.SimpleTy == MVT::i16); 158 } 159 160 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive 161 /// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors 162 /// into their primitive components. 163 /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the 164 /// same number of types as the Ins/Outs arrays in LowerFormalArguments, 165 /// LowerCall, and LowerReturn. 166 static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, 167 Type *Ty, SmallVectorImpl<EVT> &ValueVTs, 168 SmallVectorImpl<uint64_t> *Offsets = nullptr, 169 uint64_t StartingOffset = 0) { 170 SmallVector<EVT, 16> TempVTs; 171 SmallVector<uint64_t, 16> TempOffsets; 172 173 // Special case for i128 - decompose to (i64, i64) 174 if (Ty->isIntegerTy(128)) { 175 ValueVTs.push_back(EVT(MVT::i64)); 176 ValueVTs.push_back(EVT(MVT::i64)); 177 178 if (Offsets) { 179 Offsets->push_back(StartingOffset + 0); 180 Offsets->push_back(StartingOffset + 8); 181 } 182 183 return; 184 } 185 186 // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs. 187 if (StructType *STy = dyn_cast<StructType>(Ty)) { 188 auto const *SL = DL.getStructLayout(STy); 189 auto ElementNum = 0; 190 for(auto *EI : STy->elements()) { 191 ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets, 192 StartingOffset + SL->getElementOffset(ElementNum)); 193 ++ElementNum; 194 } 195 return; 196 } 197 198 ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset); 199 for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) { 200 EVT VT = TempVTs[i]; 201 uint64_t Off = TempOffsets[i]; 202 // Split vectors into individual elements, except for v2f16, which 203 // we will pass as a single scalar. 204 if (VT.isVector()) { 205 unsigned NumElts = VT.getVectorNumElements(); 206 EVT EltVT = VT.getVectorElementType(); 207 // Vectors with an even number of f16 elements will be passed to 208 // us as an array of v2f16/v2bf16 elements. 
We must match this so we 209 // stay in sync with Ins/Outs. 210 if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) { 211 switch (EltVT.getSimpleVT().SimpleTy) { 212 case MVT::f16: 213 EltVT = MVT::v2f16; 214 break; 215 case MVT::bf16: 216 EltVT = MVT::v2bf16; 217 break; 218 case MVT::i16: 219 EltVT = MVT::v2i16; 220 break; 221 default: 222 llvm_unreachable("Unexpected type"); 223 } 224 NumElts /= 2; 225 } else if (EltVT.getSimpleVT() == MVT::i8 && 226 (NumElts % 4 == 0 || NumElts == 3)) { 227 // v*i8 are formally lowered as v4i8 228 EltVT = MVT::v4i8; 229 NumElts = (NumElts + 3) / 4; 230 } 231 for (unsigned j = 0; j != NumElts; ++j) { 232 ValueVTs.push_back(EltVT); 233 if (Offsets) 234 Offsets->push_back(Off + j * EltVT.getStoreSize()); 235 } 236 } else { 237 ValueVTs.push_back(VT); 238 if (Offsets) 239 Offsets->push_back(Off); 240 } 241 } 242 } 243 244 /// PromoteScalarIntegerPTX 245 /// Used to make sure the arguments/returns are suitable for passing 246 /// and promote them to a larger size if they're not. 247 /// 248 /// The promoted type is placed in \p PromoteVT if the function returns true. 249 static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) { 250 if (VT.isScalarInteger()) { 251 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) { 252 default: 253 llvm_unreachable( 254 "Promotion is not suitable for scalars of size larger than 64-bits"); 255 case 1: 256 *PromotedVT = MVT::i1; 257 break; 258 case 2: 259 case 4: 260 case 8: 261 *PromotedVT = MVT::i8; 262 break; 263 case 16: 264 *PromotedVT = MVT::i16; 265 break; 266 case 32: 267 *PromotedVT = MVT::i32; 268 break; 269 case 64: 270 *PromotedVT = MVT::i64; 271 break; 272 } 273 return EVT(*PromotedVT) != VT; 274 } 275 return false; 276 } 277 278 // Check whether we can merge loads/stores of some of the pieces of a 279 // flattened function parameter or return value into a single vector 280 // load/store. 281 // 282 // The flattened parameter is represented as a list of EVTs and 283 // offsets, and the whole structure is aligned to ParamAlignment. This 284 // function determines whether we can load/store pieces of the 285 // parameter starting at index Idx using a single vectorized op of 286 // size AccessSize. If so, it returns the number of param pieces 287 // covered by the vector op. Otherwise, it returns 1. 288 static unsigned CanMergeParamLoadStoresStartingAt( 289 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs, 290 const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) { 291 292 // Can't vectorize if param alignment is not sufficient. 293 if (ParamAlignment < AccessSize) 294 return 1; 295 // Can't vectorize if offset is not aligned. 296 if (Offsets[Idx] & (AccessSize - 1)) 297 return 1; 298 299 EVT EltVT = ValueVTs[Idx]; 300 unsigned EltSize = EltVT.getStoreSize(); 301 302 // Element is too large to vectorize. 303 if (EltSize >= AccessSize) 304 return 1; 305 306 unsigned NumElts = AccessSize / EltSize; 307 // Can't vectorize if AccessBytes if not a multiple of EltSize. 308 if (AccessSize != EltSize * NumElts) 309 return 1; 310 311 // We don't have enough elements to vectorize. 312 if (Idx + NumElts > ValueVTs.size()) 313 return 1; 314 315 // PTX ISA can only deal with 2- and 4-element vector ops. 316 if (NumElts != 4 && NumElts != 2) 317 return 1; 318 319 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) { 320 // Types do not match. 321 if (ValueVTs[j] != EltVT) 322 return 1; 323 324 // Elements are not contiguous. 
325 if (Offsets[j] - Offsets[j - 1] != EltSize) 326 return 1; 327 } 328 // OK. We can vectorize ValueVTs[i..i+NumElts) 329 return NumElts; 330 } 331 332 // Flags for tracking per-element vectorization state of loads/stores 333 // of a flattened function parameter or return value. 334 enum ParamVectorizationFlags { 335 PVF_INNER = 0x0, // Middle elements of a vector. 336 PVF_FIRST = 0x1, // First element of the vector. 337 PVF_LAST = 0x2, // Last element of the vector. 338 // Scalar is effectively a 1-element vector. 339 PVF_SCALAR = PVF_FIRST | PVF_LAST 340 }; 341 342 // Computes whether and how we can vectorize the loads/stores of a 343 // flattened function parameter or return value. 344 // 345 // The flattened parameter is represented as the list of ValueVTs and 346 // Offsets, and is aligned to ParamAlignment bytes. We return a vector 347 // of the same size as ValueVTs indicating how each piece should be 348 // loaded/stored (i.e. as a scalar, or as part of a vector 349 // load/store). 350 static SmallVector<ParamVectorizationFlags, 16> 351 VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs, 352 const SmallVectorImpl<uint64_t> &Offsets, 353 Align ParamAlignment, bool IsVAArg = false) { 354 // Set vector size to match ValueVTs and mark all elements as 355 // scalars by default. 356 SmallVector<ParamVectorizationFlags, 16> VectorInfo; 357 VectorInfo.assign(ValueVTs.size(), PVF_SCALAR); 358 359 if (IsVAArg) 360 return VectorInfo; 361 362 // Check what we can vectorize using 128/64/32-bit accesses. 363 for (int I = 0, E = ValueVTs.size(); I != E; ++I) { 364 // Skip elements we've already processed. 365 assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state."); 366 for (unsigned AccessSize : {16, 8, 4, 2}) { 367 unsigned NumElts = CanMergeParamLoadStoresStartingAt( 368 I, AccessSize, ValueVTs, Offsets, ParamAlignment); 369 // Mark vectorized elements. 370 switch (NumElts) { 371 default: 372 llvm_unreachable("Unexpected return value"); 373 case 1: 374 // Can't vectorize using this size, try next smaller size. 375 continue; 376 case 2: 377 assert(I + 1 < E && "Not enough elements."); 378 VectorInfo[I] = PVF_FIRST; 379 VectorInfo[I + 1] = PVF_LAST; 380 I += 1; 381 break; 382 case 4: 383 assert(I + 3 < E && "Not enough elements."); 384 VectorInfo[I] = PVF_FIRST; 385 VectorInfo[I + 1] = PVF_INNER; 386 VectorInfo[I + 2] = PVF_INNER; 387 VectorInfo[I + 3] = PVF_LAST; 388 I += 3; 389 break; 390 } 391 // Break out of the inner loop because we've already succeeded 392 // using largest possible AccessSize. 393 break; 394 } 395 } 396 return VectorInfo; 397 } 398 399 // NVPTXTargetLowering Constructor. 400 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, 401 const NVPTXSubtarget &STI) 402 : TargetLowering(TM), nvTM(&TM), STI(STI) { 403 // always lower memset, memcpy, and memmove intrinsics to load/store 404 // instructions, rather 405 // then generating calls to memset, mempcy or memmove. 406 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF; 407 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned) 0xFFFFFFFF; 408 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned) 0xFFFFFFFF; 409 410 setBooleanContents(ZeroOrNegativeOneBooleanContent); 411 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 412 413 // Jump is Expensive. Don't create extra control flow for 'and', 'or' 414 // condition branches. 415 setJumpIsExpensive(true); 416 417 // Wide divides are _very_ slow. 
Try to reduce the width of the divide if 418 // possible. 419 addBypassSlowDiv(64, 32); 420 421 // By default, use the Source scheduling 422 if (sched4reg) 423 setSchedulingPreference(Sched::RegPressure); 424 else 425 setSchedulingPreference(Sched::Source); 426 427 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, 428 LegalizeAction NoF16Action) { 429 setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action); 430 }; 431 432 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, 433 LegalizeAction NoBF16Action) { 434 bool IsOpSupported = STI.hasBF16Math(); 435 // Few instructions are available on sm_90 only 436 switch(Op) { 437 case ISD::FADD: 438 case ISD::FMUL: 439 case ISD::FSUB: 440 case ISD::SELECT: 441 case ISD::SELECT_CC: 442 case ISD::SETCC: 443 case ISD::FEXP2: 444 case ISD::FCEIL: 445 case ISD::FFLOOR: 446 case ISD::FNEARBYINT: 447 case ISD::FRINT: 448 case ISD::FTRUNC: 449 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78; 450 break; 451 } 452 setOperationAction( 453 Op, VT, IsOpSupported ? Action : NoBF16Action); 454 }; 455 456 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, 457 LegalizeAction NoI16x2Action) { 458 bool IsOpSupported = false; 459 // instructions are available on sm_90 only 460 switch (Op) { 461 case ISD::ADD: 462 case ISD::SMAX: 463 case ISD::SMIN: 464 case ISD::UMIN: 465 case ISD::UMAX: 466 case ISD::SUB: 467 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80; 468 break; 469 } 470 setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action); 471 }; 472 473 addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass); 474 addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass); 475 addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass); 476 addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass); 477 addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass); 478 addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass); 479 addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass); 480 addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass); 481 addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass); 482 addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass); 483 addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass); 484 addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass); 485 486 // Conversion to/from FP16/FP16x2 is always legal. 487 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); 488 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); 489 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand); 490 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand); 491 492 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote); 493 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand); 494 495 // Conversion to/from BFP16/BFP16x2 is always legal. 496 setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Custom); 497 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2bf16, Custom); 498 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2bf16, Expand); 499 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2bf16, Expand); 500 501 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand); 502 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote); 503 if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote) 504 AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32); 505 506 // Conversion to/from i16/i16x2 is always legal. 
507 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom); 508 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); 509 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand); 510 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand); 511 512 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom); 513 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom); 514 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); 515 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom); 516 // Only logical ops can be done on v4i8 directly, others must be done 517 // elementwise. 518 setOperationAction( 519 {ISD::ABS, ISD::ADD, ISD::ADDC, ISD::ADDE, 520 ISD::BITREVERSE, ISD::CTLZ, ISD::CTPOP, ISD::CTTZ, 521 ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FSHL, ISD::FSHR, 522 ISD::MUL, ISD::MULHS, ISD::MULHU, ISD::PARITY, 523 ISD::ROTL, ISD::ROTR, ISD::SADDO, ISD::SADDO_CARRY, 524 ISD::SADDSAT, ISD::SDIV, ISD::SDIVREM, ISD::SELECT_CC, 525 ISD::SETCC, ISD::SHL, ISD::SINT_TO_FP, ISD::SMAX, 526 ISD::SMIN, ISD::SMULO, ISD::SMUL_LOHI, ISD::SRA, 527 ISD::SREM, ISD::SRL, ISD::SSHLSAT, ISD::SSUBO, 528 ISD::SSUBO_CARRY, ISD::SSUBSAT, ISD::SUB, ISD::SUBC, 529 ISD::SUBE, ISD::UADDO, ISD::UADDO_CARRY, ISD::UADDSAT, 530 ISD::UDIV, ISD::UDIVREM, ISD::UINT_TO_FP, ISD::UMAX, 531 ISD::UMIN, ISD::UMULO, ISD::UMUL_LOHI, ISD::UREM, 532 ISD::USHLSAT, ISD::USUBO, ISD::USUBO_CARRY, ISD::VSELECT, 533 ISD::USUBSAT}, 534 MVT::v4i8, Expand); 535 536 // Operations not directly supported by NVPTX. 537 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, 538 MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8, 539 MVT::i32, MVT::i64}) { 540 setOperationAction(ISD::SELECT_CC, VT, Expand); 541 setOperationAction(ISD::BR_CC, VT, Expand); 542 } 543 544 // Some SIGN_EXTEND_INREG can be done using cvt instruction. 545 // For others we will expand to a SHL/SRA pair. 546 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal); 547 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 548 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); 549 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 550 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 551 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); 552 553 setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom); 554 setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom); 555 setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom); 556 setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom); 557 setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom); 558 setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom); 559 560 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 561 setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); 562 563 // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs 564 // that don't have h/w rotation we lower them to multi-instruction assembly. 
565 // See ROT*_sw in NVPTXIntrInfo.td 566 setOperationAction(ISD::ROTL, MVT::i64, Legal); 567 setOperationAction(ISD::ROTR, MVT::i64, Legal); 568 setOperationAction(ISD::ROTL, MVT::i32, Legal); 569 setOperationAction(ISD::ROTR, MVT::i32, Legal); 570 571 setOperationAction(ISD::ROTL, MVT::i16, Expand); 572 setOperationAction(ISD::ROTL, MVT::v2i16, Expand); 573 setOperationAction(ISD::ROTR, MVT::i16, Expand); 574 setOperationAction(ISD::ROTR, MVT::v2i16, Expand); 575 setOperationAction(ISD::ROTL, MVT::i8, Expand); 576 setOperationAction(ISD::ROTR, MVT::i8, Expand); 577 setOperationAction(ISD::BSWAP, MVT::i16, Expand); 578 setOperationAction(ISD::BSWAP, MVT::v2i16, Expand); 579 setOperationAction(ISD::BSWAP, MVT::i32, Expand); 580 setOperationAction(ISD::BSWAP, MVT::i64, Expand); 581 582 // Indirect branch is not supported. 583 // This also disables Jump Table creation. 584 setOperationAction(ISD::BR_JT, MVT::Other, Expand); 585 setOperationAction(ISD::BRIND, MVT::Other, Expand); 586 587 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 588 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 589 590 // We want to legalize constant related memmove and memcopy 591 // intrinsics. 592 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); 593 594 // Turn FP extload into load/fpextend 595 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); 596 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); 597 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand); 598 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand); 599 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); 600 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); 601 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); 602 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand); 603 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand); 604 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); 605 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); 606 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); 607 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand); 608 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand); 609 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); 610 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); 611 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); 612 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand); 613 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand); 614 // Turn FP truncstore into trunc + store. 615 // FIXME: vector types should also be expanded 616 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 617 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 618 setTruncStoreAction(MVT::f32, MVT::bf16, Expand); 619 setTruncStoreAction(MVT::f64, MVT::bf16, Expand); 620 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 621 622 // PTX does not support load / store predicate registers 623 setOperationAction(ISD::LOAD, MVT::i1, Custom); 624 setOperationAction(ISD::STORE, MVT::i1, Custom); 625 626 for (MVT VT : MVT::integer_valuetypes()) { 627 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 628 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); 629 setTruncStoreAction(VT, MVT::i1, Expand); 630 } 631 632 // expand extload of vector of integers. 
633 setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16, 634 MVT::v2i8, Expand); 635 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand); 636 637 // This is legal in NVPTX 638 setOperationAction(ISD::ConstantFP, MVT::f64, Legal); 639 setOperationAction(ISD::ConstantFP, MVT::f32, Legal); 640 setOperationAction(ISD::ConstantFP, MVT::f16, Legal); 641 setOperationAction(ISD::ConstantFP, MVT::bf16, Legal); 642 643 // Lowering of DYNAMIC_STACKALLOC is unsupported. 644 // Custom lower to produce an error. 645 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 646 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); 647 648 // TRAP can be lowered to PTX trap 649 setOperationAction(ISD::TRAP, MVT::Other, Legal); 650 651 // Register custom handling for vector loads/stores 652 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 653 if (IsPTXVectorType(VT)) { 654 setOperationAction(ISD::LOAD, VT, Custom); 655 setOperationAction(ISD::STORE, VT, Custom); 656 setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom); 657 } 658 } 659 660 // Support varargs. 661 setOperationAction(ISD::VASTART, MVT::Other, Custom); 662 setOperationAction(ISD::VAARG, MVT::Other, Custom); 663 setOperationAction(ISD::VACOPY, MVT::Other, Expand); 664 setOperationAction(ISD::VAEND, MVT::Other, Expand); 665 666 // Custom handling for i8 intrinsics 667 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); 668 669 for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) { 670 setOperationAction(ISD::ABS, Ty, Legal); 671 setOperationAction(ISD::SMIN, Ty, Legal); 672 setOperationAction(ISD::SMAX, Ty, Legal); 673 setOperationAction(ISD::UMIN, Ty, Legal); 674 setOperationAction(ISD::UMAX, Ty, Legal); 675 676 setOperationAction(ISD::CTPOP, Ty, Legal); 677 setOperationAction(ISD::CTLZ, Ty, Legal); 678 } 679 680 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom); 681 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom); 682 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom); 683 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom); 684 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom); 685 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand); 686 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand); 687 688 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom); 689 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom); 690 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom); 691 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom); 692 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom); 693 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom); 694 695 // Other arithmetic and logic ops are unsupported. 
696 setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS, 697 ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT, 698 ISD::SINT_TO_FP, ISD::UINT_TO_FP}, 699 MVT::v2i16, Expand); 700 701 setOperationAction(ISD::ADDC, MVT::i32, Legal); 702 setOperationAction(ISD::ADDE, MVT::i32, Legal); 703 setOperationAction(ISD::SUBC, MVT::i32, Legal); 704 setOperationAction(ISD::SUBE, MVT::i32, Legal); 705 if (STI.getPTXVersion() >= 43) { 706 setOperationAction(ISD::ADDC, MVT::i64, Legal); 707 setOperationAction(ISD::ADDE, MVT::i64, Legal); 708 setOperationAction(ISD::SUBC, MVT::i64, Legal); 709 setOperationAction(ISD::SUBE, MVT::i64, Legal); 710 } 711 712 setOperationAction(ISD::CTTZ, MVT::i16, Expand); 713 setOperationAction(ISD::CTTZ, MVT::v2i16, Expand); 714 setOperationAction(ISD::CTTZ, MVT::i32, Expand); 715 setOperationAction(ISD::CTTZ, MVT::i64, Expand); 716 717 // PTX does not directly support SELP of i1, so promote to i32 first 718 setOperationAction(ISD::SELECT, MVT::i1, Custom); 719 720 // PTX cannot multiply two i64s in a single instruction. 721 setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); 722 setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); 723 724 // We have some custom DAG combine patterns for these nodes 725 setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD, 726 ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, 727 ISD::VSELECT}); 728 729 // setcc for f16x2 and bf16x2 needs special handling to prevent 730 // legalizer's attempt to scalarize it due to v2i1 not being legal. 731 if (STI.allowFP16Math() || STI.hasBF16Math()) 732 setTargetDAGCombine(ISD::SETCC); 733 734 // Promote fp16 arithmetic if fp16 hardware isn't available or the 735 // user passed --nvptx-no-fp16-math. The flag is useful because, 736 // although sm_53+ GPUs have some sort of FP16 support in 737 // hardware, only sm_53 and sm_60 have full implementation. Others 738 // only have token amount of hardware and are likely to run faster 739 // by using fp32 units instead. 740 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) { 741 setFP16OperationAction(Op, MVT::f16, Legal, Promote); 742 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand); 743 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); 744 // bf16 must be promoted to f32. 745 setBF16OperationAction(Op, MVT::bf16, Legal, Promote); 746 if (getOperationAction(Op, MVT::bf16) == Promote) 747 AddPromotedToType(Op, MVT::bf16, MVT::f32); 748 } 749 750 // f16/f16x2 neg was introduced in PTX 60, SM_53. 751 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 && 752 STI.getPTXVersion() >= 60 && 753 STI.allowFP16Math(); 754 for (const auto &VT : {MVT::f16, MVT::v2f16}) 755 setOperationAction(ISD::FNEG, VT, 756 IsFP16FP16x2NegAvailable ? Legal : Expand); 757 758 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand); 759 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand); 760 // (would be) Library functions. 761 762 // These map to conversion instructions for scalar FP types. 
763 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT, 764 ISD::FROUNDEVEN, ISD::FTRUNC}) { 765 setOperationAction(Op, MVT::f16, Legal); 766 setOperationAction(Op, MVT::f32, Legal); 767 setOperationAction(Op, MVT::f64, Legal); 768 setOperationAction(Op, MVT::v2f16, Expand); 769 setOperationAction(Op, MVT::v2bf16, Expand); 770 setBF16OperationAction(Op, MVT::bf16, Legal, Promote); 771 if (getOperationAction(Op, MVT::bf16) == Promote) 772 AddPromotedToType(Op, MVT::bf16, MVT::f32); 773 } 774 775 // sm_80 only has conversions between f32 and bf16. Custom lower all other 776 // bf16 conversions. 777 if (STI.hasBF16Math() && 778 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) { 779 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) { 780 setOperationAction( 781 {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, 782 VT, Custom); 783 } 784 } 785 786 setOperationAction(ISD::FROUND, MVT::f16, Promote); 787 setOperationAction(ISD::FROUND, MVT::v2f16, Expand); 788 setOperationAction(ISD::FROUND, MVT::v2bf16, Expand); 789 setOperationAction(ISD::FROUND, MVT::f32, Custom); 790 setOperationAction(ISD::FROUND, MVT::f64, Custom); 791 setOperationAction(ISD::FROUND, MVT::bf16, Promote); 792 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32); 793 794 // 'Expand' implements FCOPYSIGN without calling an external library. 795 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); 796 setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand); 797 setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand); 798 setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand); 799 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 800 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 801 802 // These map to corresponding instructions for f32/f64. f16 must be 803 // promoted to f32. v2f16 is expanded to f16, which is then promoted 804 // to f32. 805 for (const auto &Op : 806 {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) { 807 setOperationAction(Op, MVT::f16, Promote); 808 setOperationAction(Op, MVT::f32, Legal); 809 setOperationAction(Op, MVT::f64, Legal); 810 setOperationAction(Op, MVT::v2f16, Expand); 811 setOperationAction(Op, MVT::v2bf16, Expand); 812 setOperationAction(Op, MVT::bf16, Promote); 813 AddPromotedToType(Op, MVT::bf16, MVT::f32); 814 } 815 for (const auto &Op : {ISD::FABS}) { 816 setOperationAction(Op, MVT::f16, Promote); 817 setOperationAction(Op, MVT::f32, Legal); 818 setOperationAction(Op, MVT::f64, Legal); 819 setOperationAction(Op, MVT::v2f16, Expand); 820 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); 821 setBF16OperationAction(Op, MVT::bf16, Legal, Promote); 822 if (getOperationAction(Op, MVT::bf16) == Promote) 823 AddPromotedToType(Op, MVT::bf16, MVT::f32); 824 } 825 826 // max.f16, max.f16x2 and max.NaN are supported on sm_80+. 827 auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) { 828 bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70; 829 return IsAtLeastSm80 ? 
Legal : NotSm80Action; 830 }; 831 for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) { 832 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote); 833 setOperationAction(Op, MVT::f32, Legal); 834 setOperationAction(Op, MVT::f64, Legal); 835 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand); 836 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); 837 setBF16OperationAction(Op, MVT::bf16, Legal, Promote); 838 if (getOperationAction(Op, MVT::bf16) == Promote) 839 AddPromotedToType(Op, MVT::bf16, MVT::f32); 840 } 841 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) { 842 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand); 843 setFP16OperationAction(Op, MVT::bf16, Legal, Expand); 844 setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand)); 845 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand); 846 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); 847 } 848 849 // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate. 850 // No FPOW or FREM in PTX. 851 852 // Now deduce the information based on the above mentioned 853 // actions 854 computeRegisterProperties(STI.getRegisterInfo()); 855 856 setMinCmpXchgSizeInBits(32); 857 setMaxAtomicSizeInBitsSupported(64); 858 } 859 860 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { 861 switch ((NVPTXISD::NodeType)Opcode) { 862 case NVPTXISD::FIRST_NUMBER: 863 break; 864 case NVPTXISD::CALL: 865 return "NVPTXISD::CALL"; 866 case NVPTXISD::RET_GLUE: 867 return "NVPTXISD::RET_GLUE"; 868 case NVPTXISD::LOAD_PARAM: 869 return "NVPTXISD::LOAD_PARAM"; 870 case NVPTXISD::Wrapper: 871 return "NVPTXISD::Wrapper"; 872 case NVPTXISD::DeclareParam: 873 return "NVPTXISD::DeclareParam"; 874 case NVPTXISD::DeclareScalarParam: 875 return "NVPTXISD::DeclareScalarParam"; 876 case NVPTXISD::DeclareRet: 877 return "NVPTXISD::DeclareRet"; 878 case NVPTXISD::DeclareScalarRet: 879 return "NVPTXISD::DeclareScalarRet"; 880 case NVPTXISD::DeclareRetParam: 881 return "NVPTXISD::DeclareRetParam"; 882 case NVPTXISD::PrintCall: 883 return "NVPTXISD::PrintCall"; 884 case NVPTXISD::PrintConvergentCall: 885 return "NVPTXISD::PrintConvergentCall"; 886 case NVPTXISD::PrintCallUni: 887 return "NVPTXISD::PrintCallUni"; 888 case NVPTXISD::PrintConvergentCallUni: 889 return "NVPTXISD::PrintConvergentCallUni"; 890 case NVPTXISD::LoadParam: 891 return "NVPTXISD::LoadParam"; 892 case NVPTXISD::LoadParamV2: 893 return "NVPTXISD::LoadParamV2"; 894 case NVPTXISD::LoadParamV4: 895 return "NVPTXISD::LoadParamV4"; 896 case NVPTXISD::StoreParam: 897 return "NVPTXISD::StoreParam"; 898 case NVPTXISD::StoreParamV2: 899 return "NVPTXISD::StoreParamV2"; 900 case NVPTXISD::StoreParamV4: 901 return "NVPTXISD::StoreParamV4"; 902 case NVPTXISD::StoreParamS32: 903 return "NVPTXISD::StoreParamS32"; 904 case NVPTXISD::StoreParamU32: 905 return "NVPTXISD::StoreParamU32"; 906 case NVPTXISD::CallArgBegin: 907 return "NVPTXISD::CallArgBegin"; 908 case NVPTXISD::CallArg: 909 return "NVPTXISD::CallArg"; 910 case NVPTXISD::LastCallArg: 911 return "NVPTXISD::LastCallArg"; 912 case NVPTXISD::CallArgEnd: 913 return "NVPTXISD::CallArgEnd"; 914 case NVPTXISD::CallVoid: 915 return "NVPTXISD::CallVoid"; 916 case NVPTXISD::CallVal: 917 return "NVPTXISD::CallVal"; 918 case NVPTXISD::CallSymbol: 919 return "NVPTXISD::CallSymbol"; 920 case NVPTXISD::Prototype: 921 return "NVPTXISD::Prototype"; 922 case NVPTXISD::MoveParam: 923 return "NVPTXISD::MoveParam"; 924 case NVPTXISD::StoreRetval: 925 
return "NVPTXISD::StoreRetval"; 926 case NVPTXISD::StoreRetvalV2: 927 return "NVPTXISD::StoreRetvalV2"; 928 case NVPTXISD::StoreRetvalV4: 929 return "NVPTXISD::StoreRetvalV4"; 930 case NVPTXISD::PseudoUseParam: 931 return "NVPTXISD::PseudoUseParam"; 932 case NVPTXISD::RETURN: 933 return "NVPTXISD::RETURN"; 934 case NVPTXISD::CallSeqBegin: 935 return "NVPTXISD::CallSeqBegin"; 936 case NVPTXISD::CallSeqEnd: 937 return "NVPTXISD::CallSeqEnd"; 938 case NVPTXISD::CallPrototype: 939 return "NVPTXISD::CallPrototype"; 940 case NVPTXISD::ProxyReg: 941 return "NVPTXISD::ProxyReg"; 942 case NVPTXISD::LoadV2: 943 return "NVPTXISD::LoadV2"; 944 case NVPTXISD::LoadV4: 945 return "NVPTXISD::LoadV4"; 946 case NVPTXISD::LDGV2: 947 return "NVPTXISD::LDGV2"; 948 case NVPTXISD::LDGV4: 949 return "NVPTXISD::LDGV4"; 950 case NVPTXISD::LDUV2: 951 return "NVPTXISD::LDUV2"; 952 case NVPTXISD::LDUV4: 953 return "NVPTXISD::LDUV4"; 954 case NVPTXISD::StoreV2: 955 return "NVPTXISD::StoreV2"; 956 case NVPTXISD::StoreV4: 957 return "NVPTXISD::StoreV4"; 958 case NVPTXISD::FUN_SHFL_CLAMP: 959 return "NVPTXISD::FUN_SHFL_CLAMP"; 960 case NVPTXISD::FUN_SHFR_CLAMP: 961 return "NVPTXISD::FUN_SHFR_CLAMP"; 962 case NVPTXISD::IMAD: 963 return "NVPTXISD::IMAD"; 964 case NVPTXISD::BFE: 965 return "NVPTXISD::BFE"; 966 case NVPTXISD::BFI: 967 return "NVPTXISD::BFI"; 968 case NVPTXISD::PRMT: 969 return "NVPTXISD::PRMT"; 970 case NVPTXISD::SETP_F16X2: 971 return "NVPTXISD::SETP_F16X2"; 972 case NVPTXISD::SETP_BF16X2: 973 return "NVPTXISD::SETP_BF16X2"; 974 case NVPTXISD::Dummy: 975 return "NVPTXISD::Dummy"; 976 case NVPTXISD::MUL_WIDE_SIGNED: 977 return "NVPTXISD::MUL_WIDE_SIGNED"; 978 case NVPTXISD::MUL_WIDE_UNSIGNED: 979 return "NVPTXISD::MUL_WIDE_UNSIGNED"; 980 case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32"; 981 case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat"; 982 case NVPTXISD::Tex1DFloatFloatLevel: 983 return "NVPTXISD::Tex1DFloatFloatLevel"; 984 case NVPTXISD::Tex1DFloatFloatGrad: 985 return "NVPTXISD::Tex1DFloatFloatGrad"; 986 case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32"; 987 case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float"; 988 case NVPTXISD::Tex1DS32FloatLevel: 989 return "NVPTXISD::Tex1DS32FloatLevel"; 990 case NVPTXISD::Tex1DS32FloatGrad: 991 return "NVPTXISD::Tex1DS32FloatGrad"; 992 case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32"; 993 case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float"; 994 case NVPTXISD::Tex1DU32FloatLevel: 995 return "NVPTXISD::Tex1DU32FloatLevel"; 996 case NVPTXISD::Tex1DU32FloatGrad: 997 return "NVPTXISD::Tex1DU32FloatGrad"; 998 case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32"; 999 case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat"; 1000 case NVPTXISD::Tex1DArrayFloatFloatLevel: 1001 return "NVPTXISD::Tex1DArrayFloatFloatLevel"; 1002 case NVPTXISD::Tex1DArrayFloatFloatGrad: 1003 return "NVPTXISD::Tex1DArrayFloatFloatGrad"; 1004 case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32"; 1005 case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float"; 1006 case NVPTXISD::Tex1DArrayS32FloatLevel: 1007 return "NVPTXISD::Tex1DArrayS32FloatLevel"; 1008 case NVPTXISD::Tex1DArrayS32FloatGrad: 1009 return "NVPTXISD::Tex1DArrayS32FloatGrad"; 1010 case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32"; 1011 case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float"; 1012 case NVPTXISD::Tex1DArrayU32FloatLevel: 1013 return 
"NVPTXISD::Tex1DArrayU32FloatLevel"; 1014 case NVPTXISD::Tex1DArrayU32FloatGrad: 1015 return "NVPTXISD::Tex1DArrayU32FloatGrad"; 1016 case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32"; 1017 case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat"; 1018 case NVPTXISD::Tex2DFloatFloatLevel: 1019 return "NVPTXISD::Tex2DFloatFloatLevel"; 1020 case NVPTXISD::Tex2DFloatFloatGrad: 1021 return "NVPTXISD::Tex2DFloatFloatGrad"; 1022 case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32"; 1023 case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float"; 1024 case NVPTXISD::Tex2DS32FloatLevel: 1025 return "NVPTXISD::Tex2DS32FloatLevel"; 1026 case NVPTXISD::Tex2DS32FloatGrad: 1027 return "NVPTXISD::Tex2DS32FloatGrad"; 1028 case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32"; 1029 case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float"; 1030 case NVPTXISD::Tex2DU32FloatLevel: 1031 return "NVPTXISD::Tex2DU32FloatLevel"; 1032 case NVPTXISD::Tex2DU32FloatGrad: 1033 return "NVPTXISD::Tex2DU32FloatGrad"; 1034 case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32"; 1035 case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; 1036 case NVPTXISD::Tex2DArrayFloatFloatLevel: 1037 return "NVPTXISD::Tex2DArrayFloatFloatLevel"; 1038 case NVPTXISD::Tex2DArrayFloatFloatGrad: 1039 return "NVPTXISD::Tex2DArrayFloatFloatGrad"; 1040 case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32"; 1041 case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float"; 1042 case NVPTXISD::Tex2DArrayS32FloatLevel: 1043 return "NVPTXISD::Tex2DArrayS32FloatLevel"; 1044 case NVPTXISD::Tex2DArrayS32FloatGrad: 1045 return "NVPTXISD::Tex2DArrayS32FloatGrad"; 1046 case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32"; 1047 case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float"; 1048 case NVPTXISD::Tex2DArrayU32FloatLevel: 1049 return "NVPTXISD::Tex2DArrayU32FloatLevel"; 1050 case NVPTXISD::Tex2DArrayU32FloatGrad: 1051 return "NVPTXISD::Tex2DArrayU32FloatGrad"; 1052 case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32"; 1053 case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat"; 1054 case NVPTXISD::Tex3DFloatFloatLevel: 1055 return "NVPTXISD::Tex3DFloatFloatLevel"; 1056 case NVPTXISD::Tex3DFloatFloatGrad: 1057 return "NVPTXISD::Tex3DFloatFloatGrad"; 1058 case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32"; 1059 case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float"; 1060 case NVPTXISD::Tex3DS32FloatLevel: 1061 return "NVPTXISD::Tex3DS32FloatLevel"; 1062 case NVPTXISD::Tex3DS32FloatGrad: 1063 return "NVPTXISD::Tex3DS32FloatGrad"; 1064 case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32"; 1065 case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float"; 1066 case NVPTXISD::Tex3DU32FloatLevel: 1067 return "NVPTXISD::Tex3DU32FloatLevel"; 1068 case NVPTXISD::Tex3DU32FloatGrad: 1069 return "NVPTXISD::Tex3DU32FloatGrad"; 1070 case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat"; 1071 case NVPTXISD::TexCubeFloatFloatLevel: 1072 return "NVPTXISD::TexCubeFloatFloatLevel"; 1073 case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float"; 1074 case NVPTXISD::TexCubeS32FloatLevel: 1075 return "NVPTXISD::TexCubeS32FloatLevel"; 1076 case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float"; 1077 case NVPTXISD::TexCubeU32FloatLevel: 1078 return "NVPTXISD::TexCubeU32FloatLevel"; 1079 case NVPTXISD::TexCubeArrayFloatFloat: 1080 return 
"NVPTXISD::TexCubeArrayFloatFloat"; 1081 case NVPTXISD::TexCubeArrayFloatFloatLevel: 1082 return "NVPTXISD::TexCubeArrayFloatFloatLevel"; 1083 case NVPTXISD::TexCubeArrayS32Float: 1084 return "NVPTXISD::TexCubeArrayS32Float"; 1085 case NVPTXISD::TexCubeArrayS32FloatLevel: 1086 return "NVPTXISD::TexCubeArrayS32FloatLevel"; 1087 case NVPTXISD::TexCubeArrayU32Float: 1088 return "NVPTXISD::TexCubeArrayU32Float"; 1089 case NVPTXISD::TexCubeArrayU32FloatLevel: 1090 return "NVPTXISD::TexCubeArrayU32FloatLevel"; 1091 case NVPTXISD::Tld4R2DFloatFloat: 1092 return "NVPTXISD::Tld4R2DFloatFloat"; 1093 case NVPTXISD::Tld4G2DFloatFloat: 1094 return "NVPTXISD::Tld4G2DFloatFloat"; 1095 case NVPTXISD::Tld4B2DFloatFloat: 1096 return "NVPTXISD::Tld4B2DFloatFloat"; 1097 case NVPTXISD::Tld4A2DFloatFloat: 1098 return "NVPTXISD::Tld4A2DFloatFloat"; 1099 case NVPTXISD::Tld4R2DS64Float: 1100 return "NVPTXISD::Tld4R2DS64Float"; 1101 case NVPTXISD::Tld4G2DS64Float: 1102 return "NVPTXISD::Tld4G2DS64Float"; 1103 case NVPTXISD::Tld4B2DS64Float: 1104 return "NVPTXISD::Tld4B2DS64Float"; 1105 case NVPTXISD::Tld4A2DS64Float: 1106 return "NVPTXISD::Tld4A2DS64Float"; 1107 case NVPTXISD::Tld4R2DU64Float: 1108 return "NVPTXISD::Tld4R2DU64Float"; 1109 case NVPTXISD::Tld4G2DU64Float: 1110 return "NVPTXISD::Tld4G2DU64Float"; 1111 case NVPTXISD::Tld4B2DU64Float: 1112 return "NVPTXISD::Tld4B2DU64Float"; 1113 case NVPTXISD::Tld4A2DU64Float: 1114 return "NVPTXISD::Tld4A2DU64Float"; 1115 1116 case NVPTXISD::TexUnified1DFloatS32: 1117 return "NVPTXISD::TexUnified1DFloatS32"; 1118 case NVPTXISD::TexUnified1DFloatFloat: 1119 return "NVPTXISD::TexUnified1DFloatFloat"; 1120 case NVPTXISD::TexUnified1DFloatFloatLevel: 1121 return "NVPTXISD::TexUnified1DFloatFloatLevel"; 1122 case NVPTXISD::TexUnified1DFloatFloatGrad: 1123 return "NVPTXISD::TexUnified1DFloatFloatGrad"; 1124 case NVPTXISD::TexUnified1DS32S32: 1125 return "NVPTXISD::TexUnified1DS32S32"; 1126 case NVPTXISD::TexUnified1DS32Float: 1127 return "NVPTXISD::TexUnified1DS32Float"; 1128 case NVPTXISD::TexUnified1DS32FloatLevel: 1129 return "NVPTXISD::TexUnified1DS32FloatLevel"; 1130 case NVPTXISD::TexUnified1DS32FloatGrad: 1131 return "NVPTXISD::TexUnified1DS32FloatGrad"; 1132 case NVPTXISD::TexUnified1DU32S32: 1133 return "NVPTXISD::TexUnified1DU32S32"; 1134 case NVPTXISD::TexUnified1DU32Float: 1135 return "NVPTXISD::TexUnified1DU32Float"; 1136 case NVPTXISD::TexUnified1DU32FloatLevel: 1137 return "NVPTXISD::TexUnified1DU32FloatLevel"; 1138 case NVPTXISD::TexUnified1DU32FloatGrad: 1139 return "NVPTXISD::TexUnified1DU32FloatGrad"; 1140 case NVPTXISD::TexUnified1DArrayFloatS32: 1141 return "NVPTXISD::TexUnified1DArrayFloatS32"; 1142 case NVPTXISD::TexUnified1DArrayFloatFloat: 1143 return "NVPTXISD::TexUnified1DArrayFloatFloat"; 1144 case NVPTXISD::TexUnified1DArrayFloatFloatLevel: 1145 return "NVPTXISD::TexUnified1DArrayFloatFloatLevel"; 1146 case NVPTXISD::TexUnified1DArrayFloatFloatGrad: 1147 return "NVPTXISD::TexUnified1DArrayFloatFloatGrad"; 1148 case NVPTXISD::TexUnified1DArrayS32S32: 1149 return "NVPTXISD::TexUnified1DArrayS32S32"; 1150 case NVPTXISD::TexUnified1DArrayS32Float: 1151 return "NVPTXISD::TexUnified1DArrayS32Float"; 1152 case NVPTXISD::TexUnified1DArrayS32FloatLevel: 1153 return "NVPTXISD::TexUnified1DArrayS32FloatLevel"; 1154 case NVPTXISD::TexUnified1DArrayS32FloatGrad: 1155 return "NVPTXISD::TexUnified1DArrayS32FloatGrad"; 1156 case NVPTXISD::TexUnified1DArrayU32S32: 1157 return "NVPTXISD::TexUnified1DArrayU32S32"; 1158 case NVPTXISD::TexUnified1DArrayU32Float: 1159 
return "NVPTXISD::TexUnified1DArrayU32Float"; 1160 case NVPTXISD::TexUnified1DArrayU32FloatLevel: 1161 return "NVPTXISD::TexUnified1DArrayU32FloatLevel"; 1162 case NVPTXISD::TexUnified1DArrayU32FloatGrad: 1163 return "NVPTXISD::TexUnified1DArrayU32FloatGrad"; 1164 case NVPTXISD::TexUnified2DFloatS32: 1165 return "NVPTXISD::TexUnified2DFloatS32"; 1166 case NVPTXISD::TexUnified2DFloatFloat: 1167 return "NVPTXISD::TexUnified2DFloatFloat"; 1168 case NVPTXISD::TexUnified2DFloatFloatLevel: 1169 return "NVPTXISD::TexUnified2DFloatFloatLevel"; 1170 case NVPTXISD::TexUnified2DFloatFloatGrad: 1171 return "NVPTXISD::TexUnified2DFloatFloatGrad"; 1172 case NVPTXISD::TexUnified2DS32S32: 1173 return "NVPTXISD::TexUnified2DS32S32"; 1174 case NVPTXISD::TexUnified2DS32Float: 1175 return "NVPTXISD::TexUnified2DS32Float"; 1176 case NVPTXISD::TexUnified2DS32FloatLevel: 1177 return "NVPTXISD::TexUnified2DS32FloatLevel"; 1178 case NVPTXISD::TexUnified2DS32FloatGrad: 1179 return "NVPTXISD::TexUnified2DS32FloatGrad"; 1180 case NVPTXISD::TexUnified2DU32S32: 1181 return "NVPTXISD::TexUnified2DU32S32"; 1182 case NVPTXISD::TexUnified2DU32Float: 1183 return "NVPTXISD::TexUnified2DU32Float"; 1184 case NVPTXISD::TexUnified2DU32FloatLevel: 1185 return "NVPTXISD::TexUnified2DU32FloatLevel"; 1186 case NVPTXISD::TexUnified2DU32FloatGrad: 1187 return "NVPTXISD::TexUnified2DU32FloatGrad"; 1188 case NVPTXISD::TexUnified2DArrayFloatS32: 1189 return "NVPTXISD::TexUnified2DArrayFloatS32"; 1190 case NVPTXISD::TexUnified2DArrayFloatFloat: 1191 return "NVPTXISD::TexUnified2DArrayFloatFloat"; 1192 case NVPTXISD::TexUnified2DArrayFloatFloatLevel: 1193 return "NVPTXISD::TexUnified2DArrayFloatFloatLevel"; 1194 case NVPTXISD::TexUnified2DArrayFloatFloatGrad: 1195 return "NVPTXISD::TexUnified2DArrayFloatFloatGrad"; 1196 case NVPTXISD::TexUnified2DArrayS32S32: 1197 return "NVPTXISD::TexUnified2DArrayS32S32"; 1198 case NVPTXISD::TexUnified2DArrayS32Float: 1199 return "NVPTXISD::TexUnified2DArrayS32Float"; 1200 case NVPTXISD::TexUnified2DArrayS32FloatLevel: 1201 return "NVPTXISD::TexUnified2DArrayS32FloatLevel"; 1202 case NVPTXISD::TexUnified2DArrayS32FloatGrad: 1203 return "NVPTXISD::TexUnified2DArrayS32FloatGrad"; 1204 case NVPTXISD::TexUnified2DArrayU32S32: 1205 return "NVPTXISD::TexUnified2DArrayU32S32"; 1206 case NVPTXISD::TexUnified2DArrayU32Float: 1207 return "NVPTXISD::TexUnified2DArrayU32Float"; 1208 case NVPTXISD::TexUnified2DArrayU32FloatLevel: 1209 return "NVPTXISD::TexUnified2DArrayU32FloatLevel"; 1210 case NVPTXISD::TexUnified2DArrayU32FloatGrad: 1211 return "NVPTXISD::TexUnified2DArrayU32FloatGrad"; 1212 case NVPTXISD::TexUnified3DFloatS32: 1213 return "NVPTXISD::TexUnified3DFloatS32"; 1214 case NVPTXISD::TexUnified3DFloatFloat: 1215 return "NVPTXISD::TexUnified3DFloatFloat"; 1216 case NVPTXISD::TexUnified3DFloatFloatLevel: 1217 return "NVPTXISD::TexUnified3DFloatFloatLevel"; 1218 case NVPTXISD::TexUnified3DFloatFloatGrad: 1219 return "NVPTXISD::TexUnified3DFloatFloatGrad"; 1220 case NVPTXISD::TexUnified3DS32S32: 1221 return "NVPTXISD::TexUnified3DS32S32"; 1222 case NVPTXISD::TexUnified3DS32Float: 1223 return "NVPTXISD::TexUnified3DS32Float"; 1224 case NVPTXISD::TexUnified3DS32FloatLevel: 1225 return "NVPTXISD::TexUnified3DS32FloatLevel"; 1226 case NVPTXISD::TexUnified3DS32FloatGrad: 1227 return "NVPTXISD::TexUnified3DS32FloatGrad"; 1228 case NVPTXISD::TexUnified3DU32S32: 1229 return "NVPTXISD::TexUnified3DU32S32"; 1230 case NVPTXISD::TexUnified3DU32Float: 1231 return "NVPTXISD::TexUnified3DU32Float"; 1232 case 
NVPTXISD::TexUnified3DU32FloatLevel: 1233 return "NVPTXISD::TexUnified3DU32FloatLevel"; 1234 case NVPTXISD::TexUnified3DU32FloatGrad: 1235 return "NVPTXISD::TexUnified3DU32FloatGrad"; 1236 case NVPTXISD::TexUnifiedCubeFloatFloat: 1237 return "NVPTXISD::TexUnifiedCubeFloatFloat"; 1238 case NVPTXISD::TexUnifiedCubeFloatFloatLevel: 1239 return "NVPTXISD::TexUnifiedCubeFloatFloatLevel"; 1240 case NVPTXISD::TexUnifiedCubeS32Float: 1241 return "NVPTXISD::TexUnifiedCubeS32Float"; 1242 case NVPTXISD::TexUnifiedCubeS32FloatLevel: 1243 return "NVPTXISD::TexUnifiedCubeS32FloatLevel"; 1244 case NVPTXISD::TexUnifiedCubeU32Float: 1245 return "NVPTXISD::TexUnifiedCubeU32Float"; 1246 case NVPTXISD::TexUnifiedCubeU32FloatLevel: 1247 return "NVPTXISD::TexUnifiedCubeU32FloatLevel"; 1248 case NVPTXISD::TexUnifiedCubeArrayFloatFloat: 1249 return "NVPTXISD::TexUnifiedCubeArrayFloatFloat"; 1250 case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: 1251 return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel"; 1252 case NVPTXISD::TexUnifiedCubeArrayS32Float: 1253 return "NVPTXISD::TexUnifiedCubeArrayS32Float"; 1254 case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: 1255 return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel"; 1256 case NVPTXISD::TexUnifiedCubeArrayU32Float: 1257 return "NVPTXISD::TexUnifiedCubeArrayU32Float"; 1258 case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: 1259 return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel"; 1260 case NVPTXISD::TexUnifiedCubeFloatFloatGrad: 1261 return "NVPTXISD::TexUnifiedCubeFloatFloatGrad"; 1262 case NVPTXISD::TexUnifiedCubeS32FloatGrad: 1263 return "NVPTXISD::TexUnifiedCubeS32FloatGrad"; 1264 case NVPTXISD::TexUnifiedCubeU32FloatGrad: 1265 return "NVPTXISD::TexUnifiedCubeU32FloatGrad"; 1266 case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad: 1267 return "NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad"; 1268 case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad: 1269 return "NVPTXISD::TexUnifiedCubeArrayS32FloatGrad"; 1270 case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad: 1271 return "NVPTXISD::TexUnifiedCubeArrayU32FloatGrad"; 1272 case NVPTXISD::Tld4UnifiedR2DFloatFloat: 1273 return "NVPTXISD::Tld4UnifiedR2DFloatFloat"; 1274 case NVPTXISD::Tld4UnifiedG2DFloatFloat: 1275 return "NVPTXISD::Tld4UnifiedG2DFloatFloat"; 1276 case NVPTXISD::Tld4UnifiedB2DFloatFloat: 1277 return "NVPTXISD::Tld4UnifiedB2DFloatFloat"; 1278 case NVPTXISD::Tld4UnifiedA2DFloatFloat: 1279 return "NVPTXISD::Tld4UnifiedA2DFloatFloat"; 1280 case NVPTXISD::Tld4UnifiedR2DS64Float: 1281 return "NVPTXISD::Tld4UnifiedR2DS64Float"; 1282 case NVPTXISD::Tld4UnifiedG2DS64Float: 1283 return "NVPTXISD::Tld4UnifiedG2DS64Float"; 1284 case NVPTXISD::Tld4UnifiedB2DS64Float: 1285 return "NVPTXISD::Tld4UnifiedB2DS64Float"; 1286 case NVPTXISD::Tld4UnifiedA2DS64Float: 1287 return "NVPTXISD::Tld4UnifiedA2DS64Float"; 1288 case NVPTXISD::Tld4UnifiedR2DU64Float: 1289 return "NVPTXISD::Tld4UnifiedR2DU64Float"; 1290 case NVPTXISD::Tld4UnifiedG2DU64Float: 1291 return "NVPTXISD::Tld4UnifiedG2DU64Float"; 1292 case NVPTXISD::Tld4UnifiedB2DU64Float: 1293 return "NVPTXISD::Tld4UnifiedB2DU64Float"; 1294 case NVPTXISD::Tld4UnifiedA2DU64Float: 1295 return "NVPTXISD::Tld4UnifiedA2DU64Float"; 1296 1297 case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp"; 1298 case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp"; 1299 case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp"; 1300 case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp"; 1301 case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp"; 1302 case 
NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp"; 1303 case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp"; 1304 case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp"; 1305 case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp"; 1306 case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp"; 1307 case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp"; 1308 1309 case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp"; 1310 case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp"; 1311 case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp"; 1312 case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp"; 1313 case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp"; 1314 case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp"; 1315 case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp"; 1316 case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp"; 1317 case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp"; 1318 case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp"; 1319 case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp"; 1320 1321 case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp"; 1322 case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp"; 1323 case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp"; 1324 case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp"; 1325 case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp"; 1326 case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp"; 1327 case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp"; 1328 case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp"; 1329 case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp"; 1330 case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp"; 1331 case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp"; 1332 1333 case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp"; 1334 case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp"; 1335 case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp"; 1336 case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp"; 1337 case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp"; 1338 case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp"; 1339 case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp"; 1340 case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp"; 1341 case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp"; 1342 case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp"; 1343 case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp"; 1344 1345 case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp"; 1346 case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp"; 1347 case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp"; 1348 case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp"; 1349 case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp"; 1350 case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp"; 1351 case NVPTXISD::Suld3DV2I32Clamp: return 
"NVPTXISD::Suld3DV2I32Clamp"; 1352 case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp"; 1353 case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp"; 1354 case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp"; 1355 case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp"; 1356 1357 case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap"; 1358 case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap"; 1359 case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap"; 1360 case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap"; 1361 case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap"; 1362 case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap"; 1363 case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap"; 1364 case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap"; 1365 case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap"; 1366 case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap"; 1367 case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap"; 1368 1369 case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap"; 1370 case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap"; 1371 case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap"; 1372 case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap"; 1373 case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap"; 1374 case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap"; 1375 case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap"; 1376 case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap"; 1377 case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap"; 1378 case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap"; 1379 case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap"; 1380 1381 case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap"; 1382 case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap"; 1383 case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap"; 1384 case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap"; 1385 case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap"; 1386 case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap"; 1387 case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap"; 1388 case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap"; 1389 case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap"; 1390 case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap"; 1391 case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap"; 1392 1393 case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap"; 1394 case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap"; 1395 case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap"; 1396 case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap"; 1397 case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap"; 1398 case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap"; 1399 case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap"; 1400 case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap"; 1401 case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap"; 1402 case 
NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap"; 1403 case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap"; 1404 1405 case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap"; 1406 case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap"; 1407 case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap"; 1408 case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap"; 1409 case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap"; 1410 case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap"; 1411 case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap"; 1412 case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap"; 1413 case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap"; 1414 case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap"; 1415 case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap"; 1416 1417 case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero"; 1418 case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero"; 1419 case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero"; 1420 case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero"; 1421 case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero"; 1422 case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero"; 1423 case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero"; 1424 case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero"; 1425 case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero"; 1426 case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero"; 1427 case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero"; 1428 1429 case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero"; 1430 case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero"; 1431 case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero"; 1432 case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero"; 1433 case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero"; 1434 case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero"; 1435 case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero"; 1436 case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero"; 1437 case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero"; 1438 case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero"; 1439 case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero"; 1440 1441 case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero"; 1442 case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero"; 1443 case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero"; 1444 case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero"; 1445 case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero"; 1446 case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero"; 1447 case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero"; 1448 case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero"; 1449 case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero"; 1450 case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero"; 1451 case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero"; 1452 1453 case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero"; 1454 case NVPTXISD::Suld2DArrayI16Zero: return 
"NVPTXISD::Suld2DArrayI16Zero"; 1455 case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero"; 1456 case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero"; 1457 case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero"; 1458 case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero"; 1459 case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero"; 1460 case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero"; 1461 case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero"; 1462 case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero"; 1463 case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero"; 1464 1465 case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero"; 1466 case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero"; 1467 case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero"; 1468 case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero"; 1469 case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero"; 1470 case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero"; 1471 case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero"; 1472 case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero"; 1473 case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero"; 1474 case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero"; 1475 case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero"; 1476 } 1477 return nullptr; 1478 } 1479 1480 TargetLoweringBase::LegalizeTypeAction 1481 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const { 1482 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && 1483 VT.getScalarType() == MVT::i1) 1484 return TypeSplitVector; 1485 if (Isv2x16VT(VT)) 1486 return TypeLegal; 1487 return TargetLoweringBase::getPreferredVectorAction(VT); 1488 } 1489 1490 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, 1491 int Enabled, int &ExtraSteps, 1492 bool &UseOneConst, 1493 bool Reciprocal) const { 1494 if (!(Enabled == ReciprocalEstimate::Enabled || 1495 (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) 1496 return SDValue(); 1497 1498 if (ExtraSteps == ReciprocalEstimate::Unspecified) 1499 ExtraSteps = 0; 1500 1501 SDLoc DL(Operand); 1502 EVT VT = Operand.getValueType(); 1503 bool Ftz = useF32FTZ(DAG.getMachineFunction()); 1504 1505 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) { 1506 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 1507 DAG.getConstant(IID, DL, MVT::i32), Operand); 1508 }; 1509 1510 // The sqrt and rsqrt refinement processes assume we always start out with an 1511 // approximation of the rsqrt. Therefore, if we're going to do any refinement 1512 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing 1513 // any refinement, we must return a regular sqrt. 1514 if (Reciprocal || ExtraSteps > 0) { 1515 if (VT == MVT::f32) 1516 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f 1517 : Intrinsic::nvvm_rsqrt_approx_f); 1518 else if (VT == MVT::f64) 1519 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d); 1520 else 1521 return SDValue(); 1522 } else { 1523 if (VT == MVT::f32) 1524 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f 1525 : Intrinsic::nvvm_sqrt_approx_f); 1526 else { 1527 // There's no sqrt.approx.f64 instruction, so we emit 1528 // reciprocal(rsqrt(x)). 
This is faster than 1529 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain 1530 // x * rsqrt(x).) 1531 return DAG.getNode( 1532 ISD::INTRINSIC_WO_CHAIN, DL, VT, 1533 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32), 1534 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d)); 1535 } 1536 } 1537 } 1538 1539 SDValue 1540 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 1541 SDLoc dl(Op); 1542 const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op); 1543 auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace()); 1544 Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT); 1545 return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op); 1546 } 1547 1548 static bool IsTypePassedAsArray(const Type *Ty) { 1549 return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) || 1550 Ty->isHalfTy() || Ty->isBFloatTy(); 1551 } 1552 1553 std::string NVPTXTargetLowering::getPrototype( 1554 const DataLayout &DL, Type *retTy, const ArgListTy &Args, 1555 const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment, 1556 std::optional<std::pair<unsigned, const APInt &>> VAInfo, 1557 const CallBase &CB, unsigned UniqueCallSite) const { 1558 auto PtrVT = getPointerTy(DL); 1559 1560 bool isABI = (STI.getSmVersion() >= 20); 1561 assert(isABI && "Non-ABI compilation is not supported"); 1562 if (!isABI) 1563 return ""; 1564 1565 std::string Prototype; 1566 raw_string_ostream O(Prototype); 1567 O << "prototype_" << UniqueCallSite << " : .callprototype "; 1568 1569 if (retTy->getTypeID() == Type::VoidTyID) { 1570 O << "()"; 1571 } else { 1572 O << "("; 1573 if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) && 1574 !IsTypePassedAsArray(retTy)) { 1575 unsigned size = 0; 1576 if (auto *ITy = dyn_cast<IntegerType>(retTy)) { 1577 size = ITy->getBitWidth(); 1578 } else { 1579 assert(retTy->isFloatingPointTy() && 1580 "Floating point type expected here"); 1581 size = retTy->getPrimitiveSizeInBits(); 1582 } 1583 // PTX ABI requires all scalar return values to be at least 32 1584 // bits in size. fp16 normally uses .b16 as its storage type in 1585 // PTX, so its size must be adjusted here, too. 1586 size = promoteScalarArgumentSize(size); 1587 1588 O << ".param .b" << size << " _"; 1589 } else if (isa<PointerType>(retTy)) { 1590 O << ".param .b" << PtrVT.getSizeInBits() << " _"; 1591 } else if (IsTypePassedAsArray(retTy)) { 1592 O << ".param .align " << (retAlignment ? retAlignment->value() : 0) 1593 << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]"; 1594 } else { 1595 llvm_unreachable("Unknown return type"); 1596 } 1597 O << ") "; 1598 } 1599 O << "_ ("; 1600 1601 bool first = true; 1602 1603 const Function *F = CB.getFunction(); 1604 unsigned NumArgs = VAInfo ? 
VAInfo->first : Args.size(); 1605 for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) { 1606 Type *Ty = Args[i].Ty; 1607 if (!first) { 1608 O << ", "; 1609 } 1610 first = false; 1611 1612 if (!Outs[OIdx].Flags.isByVal()) { 1613 if (IsTypePassedAsArray(Ty)) { 1614 unsigned ParamAlign = 0; 1615 const CallInst *CallI = cast<CallInst>(&CB); 1616 // +1 because index 0 is reserved for return type alignment 1617 if (!getAlign(*CallI, i + 1, ParamAlign)) 1618 ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value(); 1619 O << ".param .align " << ParamAlign << " .b8 "; 1620 O << "_"; 1621 O << "[" << DL.getTypeAllocSize(Ty) << "]"; 1622 // update the index for Outs 1623 SmallVector<EVT, 16> vtparts; 1624 ComputeValueVTs(*this, DL, Ty, vtparts); 1625 if (unsigned len = vtparts.size()) 1626 OIdx += len - 1; 1627 continue; 1628 } 1629 // i8 types in IR will be i16 types in SDAG 1630 assert((getValueType(DL, Ty) == Outs[OIdx].VT || 1631 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && 1632 "type mismatch between callee prototype and arguments"); 1633 // scalar type 1634 unsigned sz = 0; 1635 if (isa<IntegerType>(Ty)) { 1636 sz = cast<IntegerType>(Ty)->getBitWidth(); 1637 sz = promoteScalarArgumentSize(sz); 1638 } else if (isa<PointerType>(Ty)) { 1639 sz = PtrVT.getSizeInBits(); 1640 } else { 1641 sz = Ty->getPrimitiveSizeInBits(); 1642 } 1643 O << ".param .b" << sz << " "; 1644 O << "_"; 1645 continue; 1646 } 1647 1648 Type *ETy = Args[i].IndirectType; 1649 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); 1650 Align ParamByValAlign = 1651 getFunctionByValParamAlign(F, ETy, InitialAlign, DL); 1652 1653 O << ".param .align " << ParamByValAlign.value() << " .b8 "; 1654 O << "_"; 1655 O << "[" << Outs[OIdx].Flags.getByValSize() << "]"; 1656 } 1657 1658 if (VAInfo) 1659 O << (first ? "" : ",") << " .param .align " << VAInfo->second 1660 << " .b8 _[]\n"; 1661 O << ")"; 1662 if (shouldEmitPTXNoReturn(&CB, *nvTM)) 1663 O << " .noreturn"; 1664 O << ";"; 1665 1666 return Prototype; 1667 } 1668 1669 Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty, 1670 unsigned Idx, 1671 const DataLayout &DL) const { 1672 if (!CB) { 1673 // CallSite is zero, fallback to ABI type alignment 1674 return DL.getABITypeAlign(Ty); 1675 } 1676 1677 unsigned Alignment = 0; 1678 const Function *DirectCallee = CB->getCalledFunction(); 1679 1680 if (!DirectCallee) { 1681 // We don't have a direct function symbol, but that may be because of 1682 // constant cast instructions in the call. 
1683 1684 // With bitcast'd call targets, the instruction will be the call 1685 if (const auto *CI = dyn_cast<CallInst>(CB)) { 1686 // Check if we have call alignment metadata 1687 if (getAlign(*CI, Idx, Alignment)) 1688 return Align(Alignment); 1689 } 1690 DirectCallee = getMaybeBitcastedCallee(CB); 1691 } 1692 1693 // Check for function alignment information if we found that the 1694 // ultimate target is a Function 1695 if (DirectCallee) { 1696 if (getAlign(*DirectCallee, Idx, Alignment)) 1697 return Align(Alignment); 1698 // If alignment information is not available, fall back to the 1699 // default function param optimized type alignment 1700 return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL); 1701 } 1702 1703 // Call is indirect, fall back to the ABI type alignment 1704 return DL.getABITypeAlign(Ty); 1705 } 1706 1707 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 1708 SmallVectorImpl<SDValue> &InVals) const { 1709 1710 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30)) 1711 report_fatal_error( 1712 "Support for variadic functions (unsized array parameter) introduced " 1713 "in PTX ISA version 6.0 and requires target sm_30."); 1714 1715 SelectionDAG &DAG = CLI.DAG; 1716 SDLoc dl = CLI.DL; 1717 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1718 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1719 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1720 SDValue Chain = CLI.Chain; 1721 SDValue Callee = CLI.Callee; 1722 bool &isTailCall = CLI.IsTailCall; 1723 ArgListTy &Args = CLI.getArgs(); 1724 Type *RetTy = CLI.RetTy; 1725 const CallBase *CB = CLI.CB; 1726 const DataLayout &DL = DAG.getDataLayout(); 1727 1728 bool isABI = (STI.getSmVersion() >= 20); 1729 assert(isABI && "Non-ABI compilation is not supported"); 1730 if (!isABI) 1731 return Chain; 1732 1733 // Variadic arguments. 1734 // 1735 // Normally, for each argument, we declare a param scalar or a param 1736 // byte array in the .param space, and store the argument value to that 1737 // param scalar or array starting at offset 0. 1738 // 1739 // In the case of the first variadic argument, we declare a vararg byte array 1740 // with size 0. The exact size of this array isn't known at this point, so 1741 // it'll be patched later. All the variadic arguments will be stored to this 1742 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is 1743 // initially set to 0, so it can be used for non-variadic arguments (which use 1744 // 0 offset) to simplify the code. 1745 // 1746 // After all vararg is processed, 'VAOffset' holds the size of the 1747 // vararg byte array. 1748 1749 SDValue VADeclareParam; // vararg byte array 1750 unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic 1751 unsigned VAOffset = 0; // current offset in the param array 1752 1753 unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1); 1754 SDValue TempChain = Chain; 1755 Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl); 1756 SDValue InGlue = Chain.getValue(1); 1757 1758 unsigned ParamCount = 0; 1759 // Args.size() and Outs.size() need not match. 1760 // Outs.size() will be larger 1761 // * if there is an aggregate argument with multiple fields (each field 1762 // showing up separately in Outs) 1763 // * if there is a vector argument with more than typical vector-length 1764 // elements (generally if more than 4) where each vector element is 1765 // individually present in Outs. 
1766 // So a different index should be used for indexing into Outs/OutVals. 1767 // See similar issue in LowerFormalArguments. 1768 unsigned OIdx = 0; 1769 // Declare the .params or .reg needed to pass values 1770 // to the function 1771 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { 1772 EVT VT = Outs[OIdx].VT; 1773 Type *Ty = Args[i].Ty; 1774 bool IsVAArg = (i >= CLI.NumFixedArgs); 1775 bool IsByVal = Outs[OIdx].Flags.isByVal(); 1776 1777 SmallVector<EVT, 16> VTs; 1778 SmallVector<uint64_t, 16> Offsets; 1779 1780 assert((!IsByVal || Args[i].IndirectType) && 1781 "byval arg must have indirect type"); 1782 Type *ETy = (IsByVal ? Args[i].IndirectType : Ty); 1783 ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset); 1784 1785 Align ArgAlign; 1786 if (IsByVal) { 1787 // The ByValAlign in the Outs[OIdx].Flags is always set at this point, 1788 // so we don't need to worry whether it's naturally aligned or not. 1789 // See TargetLowering::LowerCallTo(). 1790 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); 1791 ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy, 1792 InitialAlign, DL); 1793 if (IsVAArg) 1794 VAOffset = alignTo(VAOffset, ArgAlign); 1795 } else { 1796 ArgAlign = getArgumentAlignment(CB, Ty, ParamCount + 1, DL); 1797 } 1798 1799 unsigned TypeSize = 1800 (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty)); 1801 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1802 1803 bool NeedAlign; // Does argument declaration specify alignment? 1804 bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty); 1805 if (IsVAArg) { 1806 if (ParamCount == FirstVAArg) { 1807 SDValue DeclareParamOps[] = { 1808 Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32), 1809 DAG.getConstant(ParamCount, dl, MVT::i32), 1810 DAG.getConstant(1, dl, MVT::i32), InGlue}; 1811 VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, 1812 DeclareParamVTs, DeclareParamOps); 1813 } 1814 NeedAlign = PassAsArray; 1815 } else if (PassAsArray) { 1816 // declare .param .align <align> .b8 .param<n>[<size>]; 1817 SDValue DeclareParamOps[] = { 1818 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32), 1819 DAG.getConstant(ParamCount, dl, MVT::i32), 1820 DAG.getConstant(TypeSize, dl, MVT::i32), InGlue}; 1821 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, 1822 DeclareParamOps); 1823 NeedAlign = true; 1824 } else { 1825 // declare .param .b<size> .param<n>; 1826 if (VT.isInteger() || VT.isFloatingPoint()) { 1827 // PTX ABI requires integral types to be at least 32 bits in 1828 // size. FP16 is loaded/stored using i16, so it's handled 1829 // here as well. 1830 TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8; 1831 } 1832 SDValue DeclareScalarParamOps[] = { 1833 Chain, DAG.getConstant(ParamCount, dl, MVT::i32), 1834 DAG.getConstant(TypeSize * 8, dl, MVT::i32), 1835 DAG.getConstant(0, dl, MVT::i32), InGlue}; 1836 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, 1837 DeclareScalarParamOps); 1838 NeedAlign = false; 1839 } 1840 InGlue = Chain.getValue(1); 1841 1842 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter 1843 // than 32-bits are sign extended or zero extended, depending on 1844 // whether they are signed or unsigned types. This case applies 1845 // only to scalar parameters and not to aggregate values.
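// For example, a scalar i8 argument is declared as a 32-bit param (.param .b32)
// and stored with a 32-bit st.param, while elements of an aggregate keep their
// natural sizes inside the byte-array param.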
1846 bool ExtendIntegerParam = 1847 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; 1848 1849 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg); 1850 SmallVector<SDValue, 6> StoreOperands; 1851 for (unsigned j = 0, je = VTs.size(); j != je; ++j) { 1852 EVT EltVT = VTs[j]; 1853 int CurOffset = Offsets[j]; 1854 MaybeAlign PartAlign; 1855 if (NeedAlign) 1856 PartAlign = commonAlignment(ArgAlign, CurOffset); 1857 1858 // New store. 1859 if (VectorInfo[j] & PVF_FIRST) { 1860 assert(StoreOperands.empty() && "Unfinished preceding store."); 1861 StoreOperands.push_back(Chain); 1862 StoreOperands.push_back( 1863 DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32)); 1864 StoreOperands.push_back(DAG.getConstant( 1865 IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset), 1866 dl, MVT::i32)); 1867 } 1868 1869 SDValue StVal = OutVals[OIdx]; 1870 1871 MVT PromotedVT; 1872 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) { 1873 EltVT = EVT(PromotedVT); 1874 } 1875 if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) { 1876 llvm::ISD::NodeType Ext = 1877 Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 1878 StVal = DAG.getNode(Ext, dl, PromotedVT, StVal); 1879 } 1880 1881 if (IsByVal) { 1882 auto PtrVT = getPointerTy(DL); 1883 SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal, 1884 DAG.getConstant(CurOffset, dl, PtrVT)); 1885 StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(), 1886 PartAlign); 1887 } else if (ExtendIntegerParam) { 1888 assert(VTs.size() == 1 && "Scalar can't have multiple parts."); 1889 // zext/sext to i32 1890 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 1891 : ISD::ZERO_EXTEND, 1892 dl, MVT::i32, StVal); 1893 } 1894 1895 if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) { 1896 // Use 16-bit registers for small stores as it's the 1897 // smallest general purpose register size supported by NVPTX. 1898 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); 1899 } 1900 1901 // Record the value to store. 1902 StoreOperands.push_back(StVal); 1903 1904 if (VectorInfo[j] & PVF_LAST) { 1905 unsigned NumElts = StoreOperands.size() - 3; 1906 NVPTXISD::NodeType Op; 1907 switch (NumElts) { 1908 case 1: 1909 Op = NVPTXISD::StoreParam; 1910 break; 1911 case 2: 1912 Op = NVPTXISD::StoreParamV2; 1913 break; 1914 case 4: 1915 Op = NVPTXISD::StoreParamV4; 1916 break; 1917 default: 1918 llvm_unreachable("Invalid vector info."); 1919 } 1920 1921 StoreOperands.push_back(InGlue); 1922 1923 // Adjust type of the store op if we've extended the scalar 1924 // return value. 1925 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; 1926 1927 Chain = DAG.getMemIntrinsicNode( 1928 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, 1929 TheStoreType, MachinePointerInfo(), PartAlign, 1930 MachineMemOperand::MOStore); 1931 InGlue = Chain.getValue(1); 1932 1933 // Cleanup. 1934 StoreOperands.clear(); 1935 1936 // TODO: We may need to support vector types that can be passed 1937 // as scalars in variadic arguments. 
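// For non-byval variadic arguments, advance VAOffset past the value just
// stored so that the next vararg lands at the correct offset within the vararg
// byte array.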
1938 if (!IsByVal && IsVAArg) { 1939 assert(NumElts == 1 && 1940 "Vectorization is expected to be disabled for variadics."); 1941 VAOffset += DL.getTypeAllocSize( 1942 TheStoreType.getTypeForEVT(*DAG.getContext())); 1943 } 1944 } 1945 if (!IsByVal) 1946 ++OIdx; 1947 } 1948 assert(StoreOperands.empty() && "Unfinished parameter store."); 1949 if (!IsByVal && VTs.size() > 0) 1950 --OIdx; 1951 ++ParamCount; 1952 if (IsByVal && IsVAArg) 1953 VAOffset += TypeSize; 1954 } 1955 1956 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); 1957 MaybeAlign retAlignment = std::nullopt; 1958 1959 // Handle Result 1960 if (Ins.size() > 0) { 1961 SmallVector<EVT, 16> resvtparts; 1962 ComputeValueVTs(*this, DL, RetTy, resvtparts); 1963 1964 // Declare 1965 // .param .align N .b8 retval0[<size-in-bytes>], or 1966 // .param .b<size-in-bits> retval0 1967 unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy); 1968 if (!IsTypePassedAsArray(RetTy)) { 1969 resultsz = promoteScalarArgumentSize(resultsz); 1970 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1971 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 1972 DAG.getConstant(resultsz, dl, MVT::i32), 1973 DAG.getConstant(0, dl, MVT::i32), InGlue }; 1974 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, 1975 DeclareRetOps); 1976 InGlue = Chain.getValue(1); 1977 } else { 1978 retAlignment = getArgumentAlignment(CB, RetTy, 0, DL); 1979 assert(retAlignment && "retAlignment is guaranteed to be set"); 1980 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1981 SDValue DeclareRetOps[] = { 1982 Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32), 1983 DAG.getConstant(resultsz / 8, dl, MVT::i32), 1984 DAG.getConstant(0, dl, MVT::i32), InGlue}; 1985 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, 1986 DeclareRetOps); 1987 InGlue = Chain.getValue(1); 1988 } 1989 } 1990 1991 bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs); 1992 // Set the size of the vararg param byte array if the callee is a variadic 1993 // function and the variadic part is not empty. 1994 if (HasVAArgs) { 1995 SDValue DeclareParamOps[] = { 1996 VADeclareParam.getOperand(0), VADeclareParam.getOperand(1), 1997 VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32), 1998 VADeclareParam.getOperand(4)}; 1999 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(), 2000 VADeclareParam->getVTList(), DeclareParamOps); 2001 } 2002 2003 // Both indirect calls and libcalls have nullptr Func. In order to distinguish 2004 // between them we must rely on the call site value which is valid for 2005 // indirect calls but is always null for libcalls. 2006 bool isIndirectCall = !Func && CB; 2007 2008 if (isa<ExternalSymbolSDNode>(Callee)) { 2009 Function* CalleeFunc = nullptr; 2010 2011 // Try to find the callee in the current module. 2012 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc); 2013 assert(CalleeFunc != nullptr && "Libcall callee must be set."); 2014 2015 // Set the "libcall callee" attribute to indicate that the function 2016 // must always have a declaration. 2017 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true"); 2018 } 2019 2020 if (isIndirectCall) { 2021 // This is the indirect function call case: PTX requires a prototype of the 2022 // form 2023 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _); 2024 // to be emitted, and the label has to be used as the last arg of the call 2025 // instruction.
2026 // The prototype is embedded in a string and put as the operand for a 2027 // CallPrototype SDNode which will print out to the value of the string. 2028 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2029 std::string Proto = getPrototype( 2030 DL, RetTy, Args, Outs, retAlignment, 2031 HasVAArgs 2032 ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair( 2033 CLI.NumFixedArgs, VADeclareParam->getConstantOperandAPInt(1))) 2034 : std::nullopt, 2035 *CB, UniqueCallSite); 2036 const char *ProtoStr = nvTM->getStrPool().save(Proto).data(); 2037 SDValue ProtoOps[] = { 2038 Chain, 2039 DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), 2040 InGlue, 2041 }; 2042 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps); 2043 InGlue = Chain.getValue(1); 2044 } 2045 // Op to just print "call" 2046 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2047 SDValue PrintCallOps[] = { 2048 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue 2049 }; 2050 // We model convergent calls as separate opcodes. 2051 unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni; 2052 if (CLI.IsConvergent) 2053 Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni 2054 : NVPTXISD::PrintConvergentCall; 2055 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps); 2056 InGlue = Chain.getValue(1); 2057 2058 // Ops to print out the function name 2059 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2060 SDValue CallVoidOps[] = { Chain, Callee, InGlue }; 2061 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps); 2062 InGlue = Chain.getValue(1); 2063 2064 // Ops to print out the param list 2065 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2066 SDValue CallArgBeginOps[] = { Chain, InGlue }; 2067 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs, 2068 CallArgBeginOps); 2069 InGlue = Chain.getValue(1); 2070 2071 for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e; 2072 ++i) { 2073 unsigned opcode; 2074 if (i == (e - 1)) 2075 opcode = NVPTXISD::LastCallArg; 2076 else 2077 opcode = NVPTXISD::CallArg; 2078 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2079 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 2080 DAG.getConstant(i, dl, MVT::i32), InGlue }; 2081 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps); 2082 InGlue = Chain.getValue(1); 2083 } 2084 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2085 SDValue CallArgEndOps[] = { Chain, 2086 DAG.getConstant(isIndirectCall ? 
0 : 1, dl, MVT::i32), 2087 InGlue }; 2088 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); 2089 InGlue = Chain.getValue(1); 2090 2091 if (isIndirectCall) { 2092 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2093 SDValue PrototypeOps[] = { 2094 Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue}; 2095 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps); 2096 InGlue = Chain.getValue(1); 2097 } 2098 2099 SmallVector<SDValue, 16> ProxyRegOps; 2100 SmallVector<std::optional<MVT>, 16> ProxyRegTruncates; 2101 2102 // Generate loads from param memory/moves from registers for result 2103 if (Ins.size() > 0) { 2104 SmallVector<EVT, 16> VTs; 2105 SmallVector<uint64_t, 16> Offsets; 2106 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); 2107 assert(VTs.size() == Ins.size() && "Bad value decomposition"); 2108 2109 Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL); 2110 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); 2111 2112 SmallVector<EVT, 6> LoadVTs; 2113 int VecIdx = -1; // Index of the first element of the vector. 2114 2115 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 2116 // 32-bits are sign extended or zero extended, depending on whether 2117 // they are signed or unsigned types. 2118 bool ExtendIntegerRetVal = 2119 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 2120 2121 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 2122 bool needTruncate = false; 2123 EVT TheLoadType = VTs[i]; 2124 EVT EltType = Ins[i].VT; 2125 Align EltAlign = commonAlignment(RetAlign, Offsets[i]); 2126 MVT PromotedVT; 2127 2128 if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) { 2129 TheLoadType = EVT(PromotedVT); 2130 EltType = EVT(PromotedVT); 2131 needTruncate = true; 2132 } 2133 2134 if (ExtendIntegerRetVal) { 2135 TheLoadType = MVT::i32; 2136 EltType = MVT::i32; 2137 needTruncate = true; 2138 } else if (TheLoadType.getSizeInBits() < 16) { 2139 if (VTs[i].isInteger()) 2140 needTruncate = true; 2141 EltType = MVT::i16; 2142 } 2143 2144 // Record index of the very first element of the vector. 
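// Elements between PVF_FIRST and PVF_LAST accumulate in LoadVTs and are then
// emitted together as a single LoadParam/LoadParamV2/LoadParamV4 node.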
2145 if (VectorInfo[i] & PVF_FIRST) { 2146 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); 2147 VecIdx = i; 2148 } 2149 2150 LoadVTs.push_back(EltType); 2151 2152 if (VectorInfo[i] & PVF_LAST) { 2153 unsigned NumElts = LoadVTs.size(); 2154 LoadVTs.push_back(MVT::Other); 2155 LoadVTs.push_back(MVT::Glue); 2156 NVPTXISD::NodeType Op; 2157 switch (NumElts) { 2158 case 1: 2159 Op = NVPTXISD::LoadParam; 2160 break; 2161 case 2: 2162 Op = NVPTXISD::LoadParamV2; 2163 break; 2164 case 4: 2165 Op = NVPTXISD::LoadParamV4; 2166 break; 2167 default: 2168 llvm_unreachable("Invalid vector info."); 2169 } 2170 2171 SDValue LoadOperands[] = { 2172 Chain, DAG.getConstant(1, dl, MVT::i32), 2173 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue}; 2174 SDValue RetVal = DAG.getMemIntrinsicNode( 2175 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, 2176 MachinePointerInfo(), EltAlign, 2177 MachineMemOperand::MOLoad); 2178 2179 for (unsigned j = 0; j < NumElts; ++j) { 2180 ProxyRegOps.push_back(RetVal.getValue(j)); 2181 2182 if (needTruncate) 2183 ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT)); 2184 else 2185 ProxyRegTruncates.push_back(std::optional<MVT>()); 2186 } 2187 2188 Chain = RetVal.getValue(NumElts); 2189 InGlue = RetVal.getValue(NumElts + 1); 2190 2191 // Cleanup 2192 VecIdx = -1; 2193 LoadVTs.clear(); 2194 } 2195 } 2196 } 2197 2198 Chain = 2199 DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl); 2200 InGlue = Chain.getValue(1); 2201 2202 // Append ProxyReg instructions to the chain to make sure that `callseq_end` 2203 // will not get lost. Otherwise, during libcalls expansion, the nodes can become 2204 // dangling. 2205 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) { 2206 SDValue Ret = DAG.getNode( 2207 NVPTXISD::ProxyReg, dl, 2208 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue), 2209 { Chain, ProxyRegOps[i], InGlue } 2210 ); 2211 2212 Chain = Ret.getValue(1); 2213 InGlue = Ret.getValue(2); 2214 2215 if (ProxyRegTruncates[i]) { 2216 Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret); 2217 } 2218 2219 InVals.push_back(Ret); 2220 } 2221 2222 // set isTailCall to false for now, until we figure out how to express 2223 // tail call optimization in PTX 2224 isTailCall = false; 2225 return Chain; 2226 } 2227 2228 SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 2229 SelectionDAG &DAG) const { 2230 const Function &Fn = DAG.getMachineFunction().getFunction(); 2231 2232 DiagnosticInfoUnsupported NoDynamicAlloca( 2233 Fn, "dynamic alloca unsupported by NVPTX backend", 2234 SDLoc(Op).getDebugLoc()); 2235 DAG.getContext()->diagnose(NoDynamicAlloca); 2236 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)}; 2237 return DAG.getMergeValues(Ops, SDLoc()); 2238 } 2239 2240 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() 2241 // (see LegalizeDAG.cpp). This is slow and uses local memory. 
2242 // We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 2243 SDValue 2244 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 2245 SDNode *Node = Op.getNode(); 2246 SDLoc dl(Node); 2247 SmallVector<SDValue, 8> Ops; 2248 unsigned NumOperands = Node->getNumOperands(); 2249 for (unsigned i = 0; i < NumOperands; ++i) { 2250 SDValue SubOp = Node->getOperand(i); 2251 EVT VVT = SubOp.getNode()->getValueType(0); 2252 EVT EltVT = VVT.getVectorElementType(); 2253 unsigned NumSubElem = VVT.getVectorNumElements(); 2254 for (unsigned j = 0; j < NumSubElem; ++j) { 2255 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp, 2256 DAG.getIntPtrConstant(j, dl))); 2257 } 2258 } 2259 return DAG.getBuildVector(Node->getValueType(0), dl, Ops); 2260 } 2261 2262 // We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it 2263 // would get lowered as two constant loads and vector-packing move. 2264 // Instead we want just a constant move: 2265 // mov.b32 %r2, 0x40003C00 2266 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, 2267 SelectionDAG &DAG) const { 2268 EVT VT = Op->getValueType(0); 2269 if (!(Isv2x16VT(VT) || VT == MVT::v4i8)) 2270 return Op; 2271 2272 SDLoc DL(Op); 2273 2274 if (!llvm::all_of(Op->ops(), [](SDValue Operand) { 2275 return Operand->isUndef() || isa<ConstantSDNode>(Operand) || 2276 isa<ConstantFPSDNode>(Operand); 2277 })) { 2278 // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us 2279 // to optimize calculation of constant parts. 2280 if (VT == MVT::v4i8) { 2281 SDValue C8 = DAG.getConstant(8, DL, MVT::i32); 2282 SDValue E01 = DAG.getNode( 2283 NVPTXISD::BFI, DL, MVT::i32, 2284 DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32), 2285 DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8); 2286 SDValue E012 = 2287 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, 2288 DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), 2289 E01, DAG.getConstant(16, DL, MVT::i32), C8); 2290 SDValue E0123 = 2291 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, 2292 DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), 2293 E012, DAG.getConstant(24, DL, MVT::i32), C8); 2294 return DAG.getNode(ISD::BITCAST, DL, VT, E0123); 2295 } 2296 return Op; 2297 } 2298 2299 // Get value or the Nth operand as an APInt(32). Undef values treated as 0. 
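// For example, a v2f16 BUILD_VECTOR of the constants 1.0 and 2.0 yields
// operand values 0x3C00 and 0x4000, which are packed into 0x40003C00 below.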
2300 auto GetOperand = [](SDValue Op, int N) -> APInt { 2301 const SDValue &Operand = Op->getOperand(N); 2302 EVT VT = Op->getValueType(0); 2303 if (Operand->isUndef()) 2304 return APInt(32, 0); 2305 APInt Value; 2306 if (VT == MVT::v2f16 || VT == MVT::v2bf16) 2307 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt(); 2308 else if (VT == MVT::v2i16 || VT == MVT::v4i8) 2309 Value = Operand->getAsAPIntVal(); 2310 else 2311 llvm_unreachable("Unsupported type"); 2312 // i8 values are carried around as i16, so we need to zero out upper bits, 2313 // so they do not get in the way of combining individual byte values 2314 if (VT == MVT::v4i8) 2315 Value = Value.trunc(8); 2316 return Value.zext(32); 2317 }; 2318 APInt Value; 2319 if (Isv2x16VT(VT)) { 2320 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16); 2321 } else if (VT == MVT::v4i8) { 2322 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) | 2323 GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24); 2324 } else { 2325 llvm_unreachable("Unsupported type"); 2326 } 2327 SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32); 2328 return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const); 2329 } 2330 2331 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 2332 SelectionDAG &DAG) const { 2333 SDValue Index = Op->getOperand(1); 2334 SDValue Vector = Op->getOperand(0); 2335 SDLoc DL(Op); 2336 EVT VectorVT = Vector.getValueType(); 2337 2338 if (VectorVT == MVT::v4i8) { 2339 SDValue BFE = 2340 DAG.getNode(NVPTXISD::BFE, DL, MVT::i32, 2341 {Vector, 2342 DAG.getNode(ISD::MUL, DL, MVT::i32, 2343 DAG.getZExtOrTrunc(Index, DL, MVT::i32), 2344 DAG.getConstant(8, DL, MVT::i32)), 2345 DAG.getConstant(8, DL, MVT::i32)}); 2346 return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0)); 2347 } 2348 2349 // Constant index will be matched by tablegen. 2350 if (isa<ConstantSDNode>(Index.getNode())) 2351 return Op; 2352 2353 // Extract individual elements and select one of them. 2354 assert(Isv2x16VT(VectorVT) && "Unexpected vector type."); 2355 EVT EltVT = VectorVT.getVectorElementType(); 2356 2357 SDLoc dl(Op.getNode()); 2358 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 2359 DAG.getIntPtrConstant(0, dl)); 2360 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 2361 DAG.getIntPtrConstant(1, dl)); 2362 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1, 2363 ISD::CondCode::SETEQ); 2364 } 2365 2366 SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 2367 SelectionDAG &DAG) const { 2368 SDValue Vector = Op->getOperand(0); 2369 EVT VectorVT = Vector.getValueType(); 2370 2371 if (VectorVT != MVT::v4i8) 2372 return Op; 2373 SDLoc DL(Op); 2374 SDValue Value = Op->getOperand(1); 2375 if (Value->isUndef()) 2376 return Vector; 2377 2378 SDValue Index = Op->getOperand(2); 2379 2380 SDValue BFI = 2381 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, 2382 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector, 2383 DAG.getNode(ISD::MUL, DL, MVT::i32, 2384 DAG.getZExtOrTrunc(Index, DL, MVT::i32), 2385 DAG.getConstant(8, DL, MVT::i32)), 2386 DAG.getConstant(8, DL, MVT::i32)}); 2387 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI); 2388 } 2389 2390 SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, 2391 SelectionDAG &DAG) const { 2392 SDValue V1 = Op.getOperand(0); 2393 EVT VectorVT = V1.getValueType(); 2394 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8) 2395 return Op; 2396 2397 // Lower shuffle to PRMT instruction. 
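// PRMT selects arbitrary bytes out of the byte pair {V2, V1}: each selector
// nibble names a source byte (0-3 from V1, 4-7 from V2), so e.g. a selector of
// 0x3210 reproduces V1 unchanged.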
2398 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 2399 SDValue V2 = Op.getOperand(1); 2400 uint32_t Selector = 0; 2401 for (auto I : llvm::enumerate(SVN->getMask())) { 2402 if (I.value() != -1) // -1 is a placeholder for undef. 2403 Selector |= (I.value() << (I.index() * 4)); 2404 } 2405 2406 SDLoc DL(Op); 2407 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2, 2408 DAG.getConstant(Selector, DL, MVT::i32), 2409 DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)); 2410 } 2411 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which 2412 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2413 /// amount, or 2414 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2415 /// amount. 2416 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, 2417 SelectionDAG &DAG) const { 2418 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 2419 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 2420 2421 EVT VT = Op.getValueType(); 2422 unsigned VTBits = VT.getSizeInBits(); 2423 SDLoc dl(Op); 2424 SDValue ShOpLo = Op.getOperand(0); 2425 SDValue ShOpHi = Op.getOperand(1); 2426 SDValue ShAmt = Op.getOperand(2); 2427 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 2428 2429 if (VTBits == 32 && STI.getSmVersion() >= 35) { 2430 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 2431 // {dHi, dLo} = {aHi, aLo} >> Amt 2432 // dHi = aHi >> Amt 2433 // dLo = shf.r.clamp aLo, aHi, Amt 2434 2435 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 2436 SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi, 2437 ShAmt); 2438 2439 SDValue Ops[2] = { Lo, Hi }; 2440 return DAG.getMergeValues(Ops, dl); 2441 } 2442 else { 2443 // {dHi, dLo} = {aHi, aLo} >> Amt 2444 // - if (Amt>=size) then 2445 // dLo = aHi >> (Amt-size) 2446 // dHi = aHi >> Amt (this is either all 0 or all 1) 2447 // else 2448 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) 2449 // dHi = aHi >> Amt 2450 2451 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 2452 DAG.getConstant(VTBits, dl, MVT::i32), 2453 ShAmt); 2454 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 2455 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 2456 DAG.getConstant(VTBits, dl, MVT::i32)); 2457 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 2458 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 2459 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 2460 2461 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, 2462 DAG.getConstant(VTBits, dl, MVT::i32), 2463 ISD::SETGE); 2464 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 2465 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); 2466 2467 SDValue Ops[2] = { Lo, Hi }; 2468 return DAG.getMergeValues(Ops, dl); 2469 } 2470 } 2471 2472 /// LowerShiftLeftParts - Lower SHL_PARTS, which 2473 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2474 /// amount, or 2475 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2476 /// amount. 
2477 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, 2478 SelectionDAG &DAG) const { 2479 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 2480 assert(Op.getOpcode() == ISD::SHL_PARTS); 2481 2482 EVT VT = Op.getValueType(); 2483 unsigned VTBits = VT.getSizeInBits(); 2484 SDLoc dl(Op); 2485 SDValue ShOpLo = Op.getOperand(0); 2486 SDValue ShOpHi = Op.getOperand(1); 2487 SDValue ShAmt = Op.getOperand(2); 2488 2489 if (VTBits == 32 && STI.getSmVersion() >= 35) { 2490 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 2491 // {dHi, dLo} = {aHi, aLo} << Amt 2492 // dHi = shf.l.clamp aLo, aHi, Amt 2493 // dLo = aLo << Amt 2494 2495 SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi, 2496 ShAmt); 2497 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 2498 2499 SDValue Ops[2] = { Lo, Hi }; 2500 return DAG.getMergeValues(Ops, dl); 2501 } 2502 else { 2503 // {dHi, dLo} = {aHi, aLo} << Amt 2504 // - if (Amt>=size) then 2505 // dLo = aLo << Amt (all 0) 2506 // dHi = aLo << (Amt-size) 2507 // else 2508 // dLo = aLo << Amt 2509 // dHi = (aHi << Amt) | (aLo >> (size-Amt)) 2510 2511 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 2512 DAG.getConstant(VTBits, dl, MVT::i32), 2513 ShAmt); 2514 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 2515 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 2516 DAG.getConstant(VTBits, dl, MVT::i32)); 2517 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 2518 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 2519 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 2520 2521 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, 2522 DAG.getConstant(VTBits, dl, MVT::i32), 2523 ISD::SETGE); 2524 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 2525 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); 2526 2527 SDValue Ops[2] = { Lo, Hi }; 2528 return DAG.getMergeValues(Ops, dl); 2529 } 2530 } 2531 2532 SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { 2533 EVT VT = Op.getValueType(); 2534 2535 if (VT == MVT::f32) 2536 return LowerFROUND32(Op, DAG); 2537 2538 if (VT == MVT::f64) 2539 return LowerFROUND64(Op, DAG); 2540 2541 llvm_unreachable("unhandled type"); 2542 } 2543 2544 // This is the rounding method used in CUDA libdevice, in C-like code: 2545 // float roundf(float A) 2546 // { 2547 // float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f)); 2548 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA; 2549 // return abs(A) < 0.5 ? (float)(int)A : RoundedA; 2550 // } 2551 SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op, 2552 SelectionDAG &DAG) const { 2553 SDLoc SL(Op); 2554 SDValue A = Op.getOperand(0); 2555 EVT VT = Op.getValueType(); 2556 2557 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A); 2558 2559 // RoundedA = (float) (int) ( A > 0 ?
(A + 0.5f) : (A - 0.5f)) 2560 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A); 2561 const int SignBitMask = 0x80000000; 2562 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast, 2563 DAG.getConstant(SignBitMask, SL, MVT::i32)); 2564 const int PointFiveInBits = 0x3F000000; 2565 SDValue PointFiveWithSignRaw = 2566 DAG.getNode(ISD::OR, SL, MVT::i32, Sign, 2567 DAG.getConstant(PointFiveInBits, SL, MVT::i32)); 2568 SDValue PointFiveWithSign = 2569 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw); 2570 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign); 2571 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA); 2572 2573 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA; 2574 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2575 SDValue IsLarge = 2576 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT), 2577 ISD::SETOGT); 2578 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); 2579 2580 // return abs(A) < 0.5 ? (float)(int)A : RoundedA; 2581 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2582 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2583 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A); 2584 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA); 2585 } 2586 2587 // The implementation of round(double) is similar to that of round(float) in 2588 // that they both separate the value range into three regions and use a method 2589 // specific to the region to round the values. However, round(double) first 2590 // calculates the round of the absolute value and then adds the sign back while 2591 // round(float) directly rounds the value with sign. 2592 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op, 2593 SelectionDAG &DAG) const { 2594 SDLoc SL(Op); 2595 SDValue A = Op.getOperand(0); 2596 EVT VT = Op.getValueType(); 2597 2598 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A); 2599 2600 // double RoundedA = (double) (int) (abs(A) + 0.5f); 2601 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA, 2602 DAG.getConstantFP(0.5, SL, VT)); 2603 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA); 2604 2605 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA; 2606 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2607 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2608 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2609 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall, 2610 DAG.getConstantFP(0, SL, VT), 2611 RoundedA); 2612 2613 // Add sign to rounded_A 2614 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A); 2615 DAG.getNode(ISD::FTRUNC, SL, VT, A); 2616 2617 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA; 2618 SDValue IsLarge = 2619 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT), 2620 ISD::SETOGT); 2621 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); 2622 } 2623 2624 SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op, 2625 SelectionDAG &DAG) const { 2626 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78); 2627 2628 if (Op.getValueType() == MVT::bf16) { 2629 SDLoc Loc(Op); 2630 return DAG.getNode( 2631 ISD::FP_ROUND, Loc, MVT::bf16, 2632 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)), 2633 DAG.getIntPtrConstant(0, Loc)); 2634 } 2635 2636 // Everything else is considered legal. 
2637 return Op; 2638 } 2639 2640 SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op, 2641 SelectionDAG &DAG) const { 2642 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78); 2643 2644 if (Op.getOperand(0).getValueType() == MVT::bf16) { 2645 SDLoc Loc(Op); 2646 return DAG.getNode( 2647 Op.getOpcode(), Loc, Op.getValueType(), 2648 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0))); 2649 } 2650 2651 // Everything else is considered legal. 2652 return Op; 2653 } 2654 2655 static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) { 2656 SDLoc DL(Op); 2657 if (Op.getValueType() != MVT::v2i16) 2658 return Op; 2659 EVT EltVT = Op.getValueType().getVectorElementType(); 2660 SmallVector<SDValue> VecElements; 2661 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) { 2662 SmallVector<SDValue> ScalarArgs; 2663 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs), 2664 [&](const SDUse &O) { 2665 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, 2666 O.get(), DAG.getIntPtrConstant(I, DL)); 2667 }); 2668 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs)); 2669 } 2670 SDValue V = 2671 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements); 2672 return V; 2673 } 2674 2675 SDValue 2676 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 2677 switch (Op.getOpcode()) { 2678 case ISD::RETURNADDR: 2679 return SDValue(); 2680 case ISD::FRAMEADDR: 2681 return SDValue(); 2682 case ISD::GlobalAddress: 2683 return LowerGlobalAddress(Op, DAG); 2684 case ISD::INTRINSIC_W_CHAIN: 2685 return Op; 2686 case ISD::BUILD_VECTOR: 2687 return LowerBUILD_VECTOR(Op, DAG); 2688 case ISD::EXTRACT_SUBVECTOR: 2689 return Op; 2690 case ISD::EXTRACT_VECTOR_ELT: 2691 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 2692 case ISD::INSERT_VECTOR_ELT: 2693 return LowerINSERT_VECTOR_ELT(Op, DAG); 2694 case ISD::VECTOR_SHUFFLE: 2695 return LowerVECTOR_SHUFFLE(Op, DAG); 2696 case ISD::CONCAT_VECTORS: 2697 return LowerCONCAT_VECTORS(Op, DAG); 2698 case ISD::STORE: 2699 return LowerSTORE(Op, DAG); 2700 case ISD::LOAD: 2701 return LowerLOAD(Op, DAG); 2702 case ISD::SHL_PARTS: 2703 return LowerShiftLeftParts(Op, DAG); 2704 case ISD::SRA_PARTS: 2705 case ISD::SRL_PARTS: 2706 return LowerShiftRightParts(Op, DAG); 2707 case ISD::SELECT: 2708 return LowerSelect(Op, DAG); 2709 case ISD::FROUND: 2710 return LowerFROUND(Op, DAG); 2711 case ISD::SINT_TO_FP: 2712 case ISD::UINT_TO_FP: 2713 return LowerINT_TO_FP(Op, DAG); 2714 case ISD::FP_TO_SINT: 2715 case ISD::FP_TO_UINT: 2716 return LowerFP_TO_INT(Op, DAG); 2717 case ISD::VAARG: 2718 return LowerVAARG(Op, DAG); 2719 case ISD::VASTART: 2720 return LowerVASTART(Op, DAG); 2721 case ISD::ABS: 2722 case ISD::SMIN: 2723 case ISD::SMAX: 2724 case ISD::UMIN: 2725 case ISD::UMAX: 2726 case ISD::ADD: 2727 case ISD::SUB: 2728 case ISD::MUL: 2729 case ISD::SHL: 2730 case ISD::SREM: 2731 case ISD::UREM: 2732 return LowerVectorArith(Op, DAG); 2733 case ISD::DYNAMIC_STACKALLOC: 2734 return LowerDYNAMIC_STACKALLOC(Op, DAG); 2735 default: 2736 llvm_unreachable("Custom lowering not defined for operation"); 2737 } 2738 } 2739 2740 // This function is almost a copy of SelectionDAG::expandVAArg(). 2741 // The only diff is that this one produces loads from local address space. 
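// Rough shape of the emitted sequence: load the current va_list pointer, round
// it up to the argument's alignment if required, advance it past the argument,
// store the updated pointer back, and finally load the argument itself from
// the local address space.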
2742 SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 2743 const TargetLowering *TLI = STI.getTargetLowering(); 2744 SDLoc DL(Op); 2745 2746 SDNode *Node = Op.getNode(); 2747 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue(); 2748 EVT VT = Node->getValueType(0); 2749 auto *Ty = VT.getTypeForEVT(*DAG.getContext()); 2750 SDValue Tmp1 = Node->getOperand(0); 2751 SDValue Tmp2 = Node->getOperand(1); 2752 const MaybeAlign MA(Node->getConstantOperandVal(3)); 2753 2754 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL, 2755 Tmp1, Tmp2, MachinePointerInfo(V)); 2756 SDValue VAList = VAListLoad; 2757 2758 if (MA && *MA > TLI->getMinStackArgumentAlignment()) { 2759 VAList = DAG.getNode( 2760 ISD::ADD, DL, VAList.getValueType(), VAList, 2761 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType())); 2762 2763 VAList = DAG.getNode( 2764 ISD::AND, DL, VAList.getValueType(), VAList, 2765 DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType())); 2766 } 2767 2768 // Increment the pointer, VAList, to the next vaarg 2769 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList, 2770 DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty), 2771 DL, VAList.getValueType())); 2772 2773 // Store the incremented VAList to the legalized pointer 2774 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2, 2775 MachinePointerInfo(V)); 2776 2777 const Value *SrcV = 2778 Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL)); 2779 2780 // Load the actual argument out of the pointer VAList 2781 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV)); 2782 } 2783 2784 SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 2785 const TargetLowering *TLI = STI.getTargetLowering(); 2786 SDLoc DL(Op); 2787 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout()); 2788 2789 // Store the address of unsized array <function>_vararg[] in the ap object. 2790 SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT); 2791 SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg); 2792 2793 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 2794 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1), 2795 MachinePointerInfo(SV)); 2796 } 2797 2798 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const { 2799 SDValue Op0 = Op->getOperand(0); 2800 SDValue Op1 = Op->getOperand(1); 2801 SDValue Op2 = Op->getOperand(2); 2802 SDLoc DL(Op.getNode()); 2803 2804 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1"); 2805 2806 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); 2807 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); 2808 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2); 2809 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select); 2810 2811 return Trunc; 2812 } 2813 2814 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 2815 if (Op.getValueType() == MVT::i1) 2816 return LowerLOADi1(Op, DAG); 2817 2818 // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle 2819 // unaligned loads and have to handle it here. 
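// If the target allows this access at the load's actual alignment, keep the
// load as-is; otherwise expand it into narrower loads plus a rebuild of the
// vector value.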
2820 EVT VT = Op.getValueType(); 2821 if (Isv2x16VT(VT) || VT == MVT::v4i8) { 2822 LoadSDNode *Load = cast<LoadSDNode>(Op); 2823 EVT MemVT = Load->getMemoryVT(); 2824 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2825 MemVT, *Load->getMemOperand())) { 2826 SDValue Ops[2]; 2827 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); 2828 return DAG.getMergeValues(Ops, SDLoc(Op)); 2829 } 2830 } 2831 2832 return SDValue(); 2833 } 2834 2835 // v = ld i1* addr 2836 // => 2837 // v1 = ld i8* addr (-> i16) 2838 // v = trunc i16 to i1 2839 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { 2840 SDNode *Node = Op.getNode(); 2841 LoadSDNode *LD = cast<LoadSDNode>(Node); 2842 SDLoc dl(Node); 2843 assert(LD->getExtensionType() == ISD::NON_EXTLOAD); 2844 assert(Node->getValueType(0) == MVT::i1 && 2845 "Custom lowering for i1 load only"); 2846 SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), 2847 LD->getPointerInfo(), LD->getAlign(), 2848 LD->getMemOperand()->getFlags()); 2849 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); 2850 // The legalizer (the caller) is expecting two values from the legalized 2851 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() 2852 // in LegalizeDAG.cpp which also uses MergeValues. 2853 SDValue Ops[] = { result, LD->getChain() }; 2854 return DAG.getMergeValues(Ops, dl); 2855 } 2856 2857 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 2858 StoreSDNode *Store = cast<StoreSDNode>(Op); 2859 EVT VT = Store->getMemoryVT(); 2860 2861 if (VT == MVT::i1) 2862 return LowerSTOREi1(Op, DAG); 2863 2864 // v2f16 is legal, so we can't rely on legalizer to handle unaligned 2865 // stores and have to handle it here. 2866 if ((Isv2x16VT(VT) || VT == MVT::v4i8) && 2867 !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2868 VT, *Store->getMemOperand())) 2869 return expandUnalignedStore(Store, DAG); 2870 2871 // v2f16, v2bf16 and v2i16 don't need special handling. 2872 if (Isv2x16VT(VT) || VT == MVT::v4i8) 2873 return SDValue(); 2874 2875 if (VT.isVector()) 2876 return LowerSTOREVector(Op, DAG); 2877 2878 return SDValue(); 2879 } 2880 2881 SDValue 2882 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { 2883 SDNode *N = Op.getNode(); 2884 SDValue Val = N->getOperand(1); 2885 SDLoc DL(N); 2886 EVT ValVT = Val.getValueType(); 2887 2888 if (ValVT.isVector()) { 2889 // We only handle "native" vector sizes for now, e.g. <4 x double> is not 2890 // legal. We can (and should) split that into 2 stores of <2 x double> here 2891 // but I'm leaving that as a TODO for now. 
2892 if (!ValVT.isSimple()) 2893 return SDValue(); 2894 switch (ValVT.getSimpleVT().SimpleTy) { 2895 default: 2896 return SDValue(); 2897 case MVT::v2i8: 2898 case MVT::v2i16: 2899 case MVT::v2i32: 2900 case MVT::v2i64: 2901 case MVT::v2f16: 2902 case MVT::v2bf16: 2903 case MVT::v2f32: 2904 case MVT::v2f64: 2905 case MVT::v4i8: 2906 case MVT::v4i16: 2907 case MVT::v4i32: 2908 case MVT::v4f16: 2909 case MVT::v4bf16: 2910 case MVT::v4f32: 2911 case MVT::v8f16: // <4 x f16x2> 2912 case MVT::v8bf16: // <4 x bf16x2> 2913 case MVT::v8i16: // <4 x i16x2> 2914 // This is a "native" vector type 2915 break; 2916 } 2917 2918 MemSDNode *MemSD = cast<MemSDNode>(N); 2919 const DataLayout &TD = DAG.getDataLayout(); 2920 2921 Align Alignment = MemSD->getAlign(); 2922 Align PrefAlign = 2923 TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext())); 2924 if (Alignment < PrefAlign) { 2925 // This store is not sufficiently aligned, so bail out and let this vector 2926 // store be scalarized. Note that we may still be able to emit smaller 2927 // vector stores. For example, if we are storing a <4 x float> with an 2928 // alignment of 8, this check will fail but the legalizer will try again 2929 // with 2 x <2 x float>, which will succeed with an alignment of 8. 2930 return SDValue(); 2931 } 2932 2933 unsigned Opcode = 0; 2934 EVT EltVT = ValVT.getVectorElementType(); 2935 unsigned NumElts = ValVT.getVectorNumElements(); 2936 2937 // Since StoreV2 is a target node, we cannot rely on DAG type legalization. 2938 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 2939 // stored type to i16 and propagate the "real" type as the memory type. 2940 bool NeedExt = false; 2941 if (EltVT.getSizeInBits() < 16) 2942 NeedExt = true; 2943 2944 bool StoreF16x2 = false; 2945 switch (NumElts) { 2946 default: 2947 return SDValue(); 2948 case 2: 2949 Opcode = NVPTXISD::StoreV2; 2950 break; 2951 case 4: 2952 Opcode = NVPTXISD::StoreV4; 2953 break; 2954 case 8: 2955 // v8f16 is a special case. PTX doesn't have st.v8.f16 2956 // instruction. Instead, we split the vector into v2f16 chunks and 2957 // store them with st.v4.b32. 
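// Illustratively, <8 x half> {a,b,c,d,e,f,g,h} is regrouped below into four
// v2f16 operands {a,b} {c,d} {e,f} {g,h} and emitted as a single StoreV4
// (st.v4.b32) node.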
2958 assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector."); 2959 Opcode = NVPTXISD::StoreV4; 2960 StoreF16x2 = true; 2961 break; 2962 } 2963 2964 SmallVector<SDValue, 8> Ops; 2965 2966 // First is the chain 2967 Ops.push_back(N->getOperand(0)); 2968 2969 if (StoreF16x2) { 2970 // Combine f16,f16 -> v2f16 2971 NumElts /= 2; 2972 for (unsigned i = 0; i < NumElts; ++i) { 2973 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 2974 DAG.getIntPtrConstant(i * 2, DL)); 2975 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 2976 DAG.getIntPtrConstant(i * 2 + 1, DL)); 2977 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2); 2978 SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1); 2979 Ops.push_back(V2); 2980 } 2981 } else { 2982 // Then the split values 2983 for (unsigned i = 0; i < NumElts; ++i) { 2984 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 2985 DAG.getIntPtrConstant(i, DL)); 2986 if (NeedExt) 2987 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); 2988 Ops.push_back(ExtVal); 2989 } 2990 } 2991 2992 // Then any remaining arguments 2993 Ops.append(N->op_begin() + 2, N->op_end()); 2994 2995 SDValue NewSt = 2996 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops, 2997 MemSD->getMemoryVT(), MemSD->getMemOperand()); 2998 2999 // return DCI.CombineTo(N, NewSt, true); 3000 return NewSt; 3001 } 3002 3003 return SDValue(); 3004 } 3005 3006 // st i1 v, addr 3007 // => 3008 // v1 = zxt v to i16 3009 // st.u8 i16, addr 3010 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { 3011 SDNode *Node = Op.getNode(); 3012 SDLoc dl(Node); 3013 StoreSDNode *ST = cast<StoreSDNode>(Node); 3014 SDValue Tmp1 = ST->getChain(); 3015 SDValue Tmp2 = ST->getBasePtr(); 3016 SDValue Tmp3 = ST->getValue(); 3017 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only"); 3018 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); 3019 SDValue Result = 3020 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, 3021 ST->getAlign(), ST->getMemOperand()->getFlags()); 3022 return Result; 3023 } 3024 3025 // This creates target external symbol for a function parameter. 3026 // Name of the symbol is composed from its index and the function name. 3027 // Negative index corresponds to special parameter (unsized array) used for 3028 // passing variable arguments. 
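// For example (assuming the usual NVPTX parameter naming produced by
// getParamName), argument 1 of a function `foo` yields the symbol
// "foo_param_1", while idx == -1 yields the vararg array "foo_vararg".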
3029 SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, 3030 EVT v) const { 3031 StringRef SavedStr = nvTM->getStrPool().save( 3032 getParamName(&DAG.getMachineFunction().getFunction(), idx)); 3033 return DAG.getTargetExternalSymbol(SavedStr.data(), v); 3034 } 3035 3036 SDValue NVPTXTargetLowering::LowerFormalArguments( 3037 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3038 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3039 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3040 MachineFunction &MF = DAG.getMachineFunction(); 3041 const DataLayout &DL = DAG.getDataLayout(); 3042 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3043 3044 const Function *F = &MF.getFunction(); 3045 const AttributeList &PAL = F->getAttributes(); 3046 const TargetLowering *TLI = STI.getTargetLowering(); 3047 3048 SDValue Root = DAG.getRoot(); 3049 std::vector<SDValue> OutChains; 3050 3051 bool isABI = (STI.getSmVersion() >= 20); 3052 assert(isABI && "Non-ABI compilation is not supported"); 3053 if (!isABI) 3054 return Chain; 3055 3056 std::vector<Type *> argTypes; 3057 std::vector<const Argument *> theArgs; 3058 for (const Argument &I : F->args()) { 3059 theArgs.push_back(&I); 3060 argTypes.push_back(I.getType()); 3061 } 3062 // argTypes.size() (or theArgs.size()) and Ins.size() need not match. 3063 // Ins.size() will be larger 3064 // * if there is an aggregate argument with multiple fields (each field 3065 // showing up separately in Ins) 3066 // * if there is a vector argument with more than typical vector-length 3067 // elements (generally if more than 4) where each vector element is 3068 // individually present in Ins. 3069 // So a different index should be used for indexing into Ins. 3070 // See similar issue in LowerCall. 3071 unsigned InsIdx = 0; 3072 3073 int idx = 0; 3074 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) { 3075 Type *Ty = argTypes[i]; 3076 3077 if (theArgs[i]->use_empty()) { 3078 // argument is dead 3079 if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) { 3080 SmallVector<EVT, 16> vtparts; 3081 3082 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts); 3083 if (vtparts.empty()) 3084 report_fatal_error("Empty parameter types are not supported"); 3085 3086 for (unsigned parti = 0, parte = vtparts.size(); parti != parte; 3087 ++parti) { 3088 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 3089 ++InsIdx; 3090 } 3091 if (vtparts.size() > 0) 3092 --InsIdx; 3093 continue; 3094 } 3095 if (Ty->isVectorTy()) { 3096 EVT ObjectVT = getValueType(DL, Ty); 3097 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT); 3098 for (unsigned parti = 0; parti < NumRegs; ++parti) { 3099 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 3100 ++InsIdx; 3101 } 3102 if (NumRegs > 0) 3103 --InsIdx; 3104 continue; 3105 } 3106 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 3107 continue; 3108 } 3109 3110 // In the following cases, assign a node order of "idx+1" 3111 // to newly created nodes. The SDNodes for params have to 3112 // appear in the same order as their order of appearance 3113 // in the original function. "idx+1" holds that order. 
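// Two cases follow: arguments without the byval attribute are loaded from the
// .param address space (possibly vectorized into v2/v4 loads), while byval
// arguments are handled further below by returning MoveParam(param symbol).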
3114 if (!PAL.hasParamAttr(i, Attribute::ByVal)) { 3115 bool aggregateIsPacked = false; 3116 if (StructType *STy = dyn_cast<StructType>(Ty)) 3117 aggregateIsPacked = STy->isPacked(); 3118 3119 SmallVector<EVT, 16> VTs; 3120 SmallVector<uint64_t, 16> Offsets; 3121 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); 3122 if (VTs.empty()) 3123 report_fatal_error("Empty parameter types are not supported"); 3124 3125 auto VectorInfo = 3126 VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty)); 3127 3128 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 3129 int VecIdx = -1; // Index of the first element of the current vector. 3130 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { 3131 if (VectorInfo[parti] & PVF_FIRST) { 3132 assert(VecIdx == -1 && "Orphaned vector."); 3133 VecIdx = parti; 3134 } 3135 3136 // That's the last element of this store op. 3137 if (VectorInfo[parti] & PVF_LAST) { 3138 unsigned NumElts = parti - VecIdx + 1; 3139 EVT EltVT = VTs[parti]; 3140 // i1 is loaded/stored as i8. 3141 EVT LoadVT = EltVT; 3142 if (EltVT == MVT::i1) 3143 LoadVT = MVT::i8; 3144 else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8) 3145 // getLoad needs a vector type, but it can't handle 3146 // vectors which contain v2f16 or v2bf16 elements. So we must load 3147 // using i32 here and then bitcast back. 3148 LoadVT = MVT::i32; 3149 3150 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); 3151 SDValue VecAddr = 3152 DAG.getNode(ISD::ADD, dl, PtrVT, Arg, 3153 DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); 3154 Value *srcValue = Constant::getNullValue(PointerType::get( 3155 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); 3156 SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr, 3157 MachinePointerInfo(srcValue), 3158 MaybeAlign(aggregateIsPacked ? 1 : 0), 3159 MachineMemOperand::MODereferenceable | 3160 MachineMemOperand::MOInvariant); 3161 if (P.getNode()) 3162 P.getNode()->setIROrder(idx + 1); 3163 for (unsigned j = 0; j < NumElts; ++j) { 3164 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P, 3165 DAG.getIntPtrConstant(j, dl)); 3166 // We've loaded i1 as an i8 and now must truncate it back to i1 3167 if (EltVT == MVT::i1) 3168 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); 3169 // v2f16 was loaded as an i32. Now we must bitcast it back. 3170 else if (EltVT != LoadVT) 3171 Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt); 3172 3173 // If a promoted integer type is used, truncate down to the original 3174 MVT PromotedVT; 3175 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) { 3176 Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); 3177 } 3178 3179 // Extend the element if necessary (e.g. an i8 is loaded 3180 // into an i16 register) 3181 if (Ins[InsIdx].VT.isInteger() && 3182 Ins[InsIdx].VT.getFixedSizeInBits() > 3183 LoadVT.getFixedSizeInBits()) { 3184 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 3185 : ISD::ZERO_EXTEND; 3186 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt); 3187 } 3188 InVals.push_back(Elt); 3189 } 3190 3191 // Reset vector tracking state. 3192 VecIdx = -1; 3193 } 3194 ++InsIdx; 3195 } 3196 if (VTs.size() > 0) 3197 --InsIdx; 3198 continue; 3199 } 3200 3201 // Param has ByVal attribute 3202 // Return MoveParam(param symbol). 
3203 // Ideally, the param symbol can be returned directly, 3204 // but when SDNode builder decides to use it in a CopyToReg(), 3205 // machine instruction fails because TargetExternalSymbol 3206 // (not lowered) is target dependent, and CopyToReg assumes 3207 // the source is lowered. 3208 EVT ObjectVT = getValueType(DL, Ty); 3209 assert(ObjectVT == Ins[InsIdx].VT && 3210 "Ins type did not match function type"); 3211 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 3212 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); 3213 if (p.getNode()) 3214 p.getNode()->setIROrder(idx + 1); 3215 InVals.push_back(p); 3216 } 3217 3218 if (!OutChains.empty()) 3219 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains)); 3220 3221 return Chain; 3222 } 3223 3224 SDValue 3225 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 3226 bool isVarArg, 3227 const SmallVectorImpl<ISD::OutputArg> &Outs, 3228 const SmallVectorImpl<SDValue> &OutVals, 3229 const SDLoc &dl, SelectionDAG &DAG) const { 3230 const MachineFunction &MF = DAG.getMachineFunction(); 3231 const Function &F = MF.getFunction(); 3232 Type *RetTy = MF.getFunction().getReturnType(); 3233 3234 bool isABI = (STI.getSmVersion() >= 20); 3235 assert(isABI && "Non-ABI compilation is not supported"); 3236 if (!isABI) 3237 return Chain; 3238 3239 const DataLayout &DL = DAG.getDataLayout(); 3240 SmallVector<SDValue, 16> PromotedOutVals; 3241 SmallVector<EVT, 16> VTs; 3242 SmallVector<uint64_t, 16> Offsets; 3243 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); 3244 assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); 3245 3246 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 3247 SDValue PromotedOutVal = OutVals[i]; 3248 MVT PromotedVT; 3249 if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) { 3250 VTs[i] = EVT(PromotedVT); 3251 } 3252 if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) { 3253 llvm::ISD::NodeType Ext = 3254 Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 3255 PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal); 3256 } 3257 PromotedOutVals.push_back(PromotedOutVal); 3258 } 3259 3260 auto VectorInfo = VectorizePTXValueVTs( 3261 VTs, Offsets, 3262 RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL) 3263 : Align(1)); 3264 3265 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 3266 // 32-bits are sign extended or zero extended, depending on whether 3267 // they are signed or unsigned types. 3268 bool ExtendIntegerRetVal = 3269 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 3270 3271 SmallVector<SDValue, 6> StoreOperands; 3272 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 3273 // New load/store. Record chain and offset operands. 3274 if (VectorInfo[i] & PVF_FIRST) { 3275 assert(StoreOperands.empty() && "Orphaned operand list."); 3276 StoreOperands.push_back(Chain); 3277 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); 3278 } 3279 3280 SDValue OutVal = OutVals[i]; 3281 SDValue RetVal = PromotedOutVals[i]; 3282 3283 if (ExtendIntegerRetVal) { 3284 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND 3285 : ISD::ZERO_EXTEND, 3286 dl, MVT::i32, RetVal); 3287 } else if (OutVal.getValueSizeInBits() < 16) { 3288 // Use 16-bit registers for small load-stores as it's the 3289 // smallest general purpose register size supported by NVPTX. 3290 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); 3291 } 3292 3293 // Record the value to return. 
3294 StoreOperands.push_back(RetVal); 3295 3296 // That's the last element of this store op. 3297 if (VectorInfo[i] & PVF_LAST) { 3298 NVPTXISD::NodeType Op; 3299 unsigned NumElts = StoreOperands.size() - 2; 3300 switch (NumElts) { 3301 case 1: 3302 Op = NVPTXISD::StoreRetval; 3303 break; 3304 case 2: 3305 Op = NVPTXISD::StoreRetvalV2; 3306 break; 3307 case 4: 3308 Op = NVPTXISD::StoreRetvalV4; 3309 break; 3310 default: 3311 llvm_unreachable("Invalid vector info."); 3312 } 3313 3314 // Adjust type of load/store op if we've extended the scalar 3315 // return value. 3316 EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; 3317 Chain = DAG.getMemIntrinsicNode( 3318 Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType, 3319 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); 3320 // Cleanup vector state. 3321 StoreOperands.clear(); 3322 } 3323 } 3324 3325 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain); 3326 } 3327 3328 void NVPTXTargetLowering::LowerAsmOperandForConstraint( 3329 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops, 3330 SelectionDAG &DAG) const { 3331 if (Constraint.size() > 1) 3332 return; 3333 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 3334 } 3335 3336 static unsigned getOpcForTextureInstr(unsigned Intrinsic) { 3337 switch (Intrinsic) { 3338 default: 3339 return 0; 3340 3341 case Intrinsic::nvvm_tex_1d_v4f32_s32: 3342 return NVPTXISD::Tex1DFloatS32; 3343 case Intrinsic::nvvm_tex_1d_v4f32_f32: 3344 return NVPTXISD::Tex1DFloatFloat; 3345 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 3346 return NVPTXISD::Tex1DFloatFloatLevel; 3347 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 3348 return NVPTXISD::Tex1DFloatFloatGrad; 3349 case Intrinsic::nvvm_tex_1d_v4s32_s32: 3350 return NVPTXISD::Tex1DS32S32; 3351 case Intrinsic::nvvm_tex_1d_v4s32_f32: 3352 return NVPTXISD::Tex1DS32Float; 3353 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 3354 return NVPTXISD::Tex1DS32FloatLevel; 3355 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 3356 return NVPTXISD::Tex1DS32FloatGrad; 3357 case Intrinsic::nvvm_tex_1d_v4u32_s32: 3358 return NVPTXISD::Tex1DU32S32; 3359 case Intrinsic::nvvm_tex_1d_v4u32_f32: 3360 return NVPTXISD::Tex1DU32Float; 3361 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 3362 return NVPTXISD::Tex1DU32FloatLevel; 3363 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 3364 return NVPTXISD::Tex1DU32FloatGrad; 3365 3366 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 3367 return NVPTXISD::Tex1DArrayFloatS32; 3368 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 3369 return NVPTXISD::Tex1DArrayFloatFloat; 3370 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 3371 return NVPTXISD::Tex1DArrayFloatFloatLevel; 3372 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 3373 return NVPTXISD::Tex1DArrayFloatFloatGrad; 3374 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 3375 return NVPTXISD::Tex1DArrayS32S32; 3376 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 3377 return NVPTXISD::Tex1DArrayS32Float; 3378 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 3379 return NVPTXISD::Tex1DArrayS32FloatLevel; 3380 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 3381 return NVPTXISD::Tex1DArrayS32FloatGrad; 3382 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 3383 return NVPTXISD::Tex1DArrayU32S32; 3384 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 3385 return NVPTXISD::Tex1DArrayU32Float; 3386 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 3387 return NVPTXISD::Tex1DArrayU32FloatLevel; 3388 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 3389 return 
NVPTXISD::Tex1DArrayU32FloatGrad; 3390 3391 case Intrinsic::nvvm_tex_2d_v4f32_s32: 3392 return NVPTXISD::Tex2DFloatS32; 3393 case Intrinsic::nvvm_tex_2d_v4f32_f32: 3394 return NVPTXISD::Tex2DFloatFloat; 3395 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 3396 return NVPTXISD::Tex2DFloatFloatLevel; 3397 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 3398 return NVPTXISD::Tex2DFloatFloatGrad; 3399 case Intrinsic::nvvm_tex_2d_v4s32_s32: 3400 return NVPTXISD::Tex2DS32S32; 3401 case Intrinsic::nvvm_tex_2d_v4s32_f32: 3402 return NVPTXISD::Tex2DS32Float; 3403 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 3404 return NVPTXISD::Tex2DS32FloatLevel; 3405 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 3406 return NVPTXISD::Tex2DS32FloatGrad; 3407 case Intrinsic::nvvm_tex_2d_v4u32_s32: 3408 return NVPTXISD::Tex2DU32S32; 3409 case Intrinsic::nvvm_tex_2d_v4u32_f32: 3410 return NVPTXISD::Tex2DU32Float; 3411 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 3412 return NVPTXISD::Tex2DU32FloatLevel; 3413 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 3414 return NVPTXISD::Tex2DU32FloatGrad; 3415 3416 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 3417 return NVPTXISD::Tex2DArrayFloatS32; 3418 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 3419 return NVPTXISD::Tex2DArrayFloatFloat; 3420 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 3421 return NVPTXISD::Tex2DArrayFloatFloatLevel; 3422 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 3423 return NVPTXISD::Tex2DArrayFloatFloatGrad; 3424 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 3425 return NVPTXISD::Tex2DArrayS32S32; 3426 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 3427 return NVPTXISD::Tex2DArrayS32Float; 3428 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 3429 return NVPTXISD::Tex2DArrayS32FloatLevel; 3430 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 3431 return NVPTXISD::Tex2DArrayS32FloatGrad; 3432 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 3433 return NVPTXISD::Tex2DArrayU32S32; 3434 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 3435 return NVPTXISD::Tex2DArrayU32Float; 3436 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 3437 return NVPTXISD::Tex2DArrayU32FloatLevel; 3438 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 3439 return NVPTXISD::Tex2DArrayU32FloatGrad; 3440 3441 case Intrinsic::nvvm_tex_3d_v4f32_s32: 3442 return NVPTXISD::Tex3DFloatS32; 3443 case Intrinsic::nvvm_tex_3d_v4f32_f32: 3444 return NVPTXISD::Tex3DFloatFloat; 3445 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 3446 return NVPTXISD::Tex3DFloatFloatLevel; 3447 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 3448 return NVPTXISD::Tex3DFloatFloatGrad; 3449 case Intrinsic::nvvm_tex_3d_v4s32_s32: 3450 return NVPTXISD::Tex3DS32S32; 3451 case Intrinsic::nvvm_tex_3d_v4s32_f32: 3452 return NVPTXISD::Tex3DS32Float; 3453 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 3454 return NVPTXISD::Tex3DS32FloatLevel; 3455 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 3456 return NVPTXISD::Tex3DS32FloatGrad; 3457 case Intrinsic::nvvm_tex_3d_v4u32_s32: 3458 return NVPTXISD::Tex3DU32S32; 3459 case Intrinsic::nvvm_tex_3d_v4u32_f32: 3460 return NVPTXISD::Tex3DU32Float; 3461 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 3462 return NVPTXISD::Tex3DU32FloatLevel; 3463 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 3464 return NVPTXISD::Tex3DU32FloatGrad; 3465 3466 case Intrinsic::nvvm_tex_cube_v4f32_f32: 3467 return NVPTXISD::TexCubeFloatFloat; 3468 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 3469 return NVPTXISD::TexCubeFloatFloatLevel; 3470 case Intrinsic::nvvm_tex_cube_v4s32_f32: 3471 return 
NVPTXISD::TexCubeS32Float; 3472 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 3473 return NVPTXISD::TexCubeS32FloatLevel; 3474 case Intrinsic::nvvm_tex_cube_v4u32_f32: 3475 return NVPTXISD::TexCubeU32Float; 3476 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 3477 return NVPTXISD::TexCubeU32FloatLevel; 3478 3479 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 3480 return NVPTXISD::TexCubeArrayFloatFloat; 3481 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 3482 return NVPTXISD::TexCubeArrayFloatFloatLevel; 3483 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 3484 return NVPTXISD::TexCubeArrayS32Float; 3485 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 3486 return NVPTXISD::TexCubeArrayS32FloatLevel; 3487 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 3488 return NVPTXISD::TexCubeArrayU32Float; 3489 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 3490 return NVPTXISD::TexCubeArrayU32FloatLevel; 3491 3492 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 3493 return NVPTXISD::Tld4R2DFloatFloat; 3494 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 3495 return NVPTXISD::Tld4G2DFloatFloat; 3496 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 3497 return NVPTXISD::Tld4B2DFloatFloat; 3498 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 3499 return NVPTXISD::Tld4A2DFloatFloat; 3500 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 3501 return NVPTXISD::Tld4R2DS64Float; 3502 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 3503 return NVPTXISD::Tld4G2DS64Float; 3504 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 3505 return NVPTXISD::Tld4B2DS64Float; 3506 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 3507 return NVPTXISD::Tld4A2DS64Float; 3508 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 3509 return NVPTXISD::Tld4R2DU64Float; 3510 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 3511 return NVPTXISD::Tld4G2DU64Float; 3512 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 3513 return NVPTXISD::Tld4B2DU64Float; 3514 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 3515 return NVPTXISD::Tld4A2DU64Float; 3516 3517 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 3518 return NVPTXISD::TexUnified1DFloatS32; 3519 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 3520 return NVPTXISD::TexUnified1DFloatFloat; 3521 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 3522 return NVPTXISD::TexUnified1DFloatFloatLevel; 3523 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 3524 return NVPTXISD::TexUnified1DFloatFloatGrad; 3525 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 3526 return NVPTXISD::TexUnified1DS32S32; 3527 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 3528 return NVPTXISD::TexUnified1DS32Float; 3529 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 3530 return NVPTXISD::TexUnified1DS32FloatLevel; 3531 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 3532 return NVPTXISD::TexUnified1DS32FloatGrad; 3533 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 3534 return NVPTXISD::TexUnified1DU32S32; 3535 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 3536 return NVPTXISD::TexUnified1DU32Float; 3537 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 3538 return NVPTXISD::TexUnified1DU32FloatLevel; 3539 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 3540 return NVPTXISD::TexUnified1DU32FloatGrad; 3541 3542 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 3543 return NVPTXISD::TexUnified1DArrayFloatS32; 3544 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 3545 return NVPTXISD::TexUnified1DArrayFloatFloat; 3546 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 3547 return NVPTXISD::TexUnified1DArrayFloatFloatLevel; 3548 case 
Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 3549 return NVPTXISD::TexUnified1DArrayFloatFloatGrad; 3550 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 3551 return NVPTXISD::TexUnified1DArrayS32S32; 3552 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 3553 return NVPTXISD::TexUnified1DArrayS32Float; 3554 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 3555 return NVPTXISD::TexUnified1DArrayS32FloatLevel; 3556 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 3557 return NVPTXISD::TexUnified1DArrayS32FloatGrad; 3558 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 3559 return NVPTXISD::TexUnified1DArrayU32S32; 3560 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 3561 return NVPTXISD::TexUnified1DArrayU32Float; 3562 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 3563 return NVPTXISD::TexUnified1DArrayU32FloatLevel; 3564 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 3565 return NVPTXISD::TexUnified1DArrayU32FloatGrad; 3566 3567 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 3568 return NVPTXISD::TexUnified2DFloatS32; 3569 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 3570 return NVPTXISD::TexUnified2DFloatFloat; 3571 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 3572 return NVPTXISD::TexUnified2DFloatFloatLevel; 3573 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 3574 return NVPTXISD::TexUnified2DFloatFloatGrad; 3575 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 3576 return NVPTXISD::TexUnified2DS32S32; 3577 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 3578 return NVPTXISD::TexUnified2DS32Float; 3579 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 3580 return NVPTXISD::TexUnified2DS32FloatLevel; 3581 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 3582 return NVPTXISD::TexUnified2DS32FloatGrad; 3583 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 3584 return NVPTXISD::TexUnified2DU32S32; 3585 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 3586 return NVPTXISD::TexUnified2DU32Float; 3587 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 3588 return NVPTXISD::TexUnified2DU32FloatLevel; 3589 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 3590 return NVPTXISD::TexUnified2DU32FloatGrad; 3591 3592 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 3593 return NVPTXISD::TexUnified2DArrayFloatS32; 3594 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 3595 return NVPTXISD::TexUnified2DArrayFloatFloat; 3596 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 3597 return NVPTXISD::TexUnified2DArrayFloatFloatLevel; 3598 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 3599 return NVPTXISD::TexUnified2DArrayFloatFloatGrad; 3600 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 3601 return NVPTXISD::TexUnified2DArrayS32S32; 3602 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 3603 return NVPTXISD::TexUnified2DArrayS32Float; 3604 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 3605 return NVPTXISD::TexUnified2DArrayS32FloatLevel; 3606 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 3607 return NVPTXISD::TexUnified2DArrayS32FloatGrad; 3608 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 3609 return NVPTXISD::TexUnified2DArrayU32S32; 3610 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 3611 return NVPTXISD::TexUnified2DArrayU32Float; 3612 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 3613 return NVPTXISD::TexUnified2DArrayU32FloatLevel; 3614 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 3615 return 
NVPTXISD::TexUnified2DArrayU32FloatGrad; 3616 3617 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 3618 return NVPTXISD::TexUnified3DFloatS32; 3619 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 3620 return NVPTXISD::TexUnified3DFloatFloat; 3621 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 3622 return NVPTXISD::TexUnified3DFloatFloatLevel; 3623 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 3624 return NVPTXISD::TexUnified3DFloatFloatGrad; 3625 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 3626 return NVPTXISD::TexUnified3DS32S32; 3627 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 3628 return NVPTXISD::TexUnified3DS32Float; 3629 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 3630 return NVPTXISD::TexUnified3DS32FloatLevel; 3631 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 3632 return NVPTXISD::TexUnified3DS32FloatGrad; 3633 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 3634 return NVPTXISD::TexUnified3DU32S32; 3635 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 3636 return NVPTXISD::TexUnified3DU32Float; 3637 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 3638 return NVPTXISD::TexUnified3DU32FloatLevel; 3639 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 3640 return NVPTXISD::TexUnified3DU32FloatGrad; 3641 3642 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 3643 return NVPTXISD::TexUnifiedCubeFloatFloat; 3644 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 3645 return NVPTXISD::TexUnifiedCubeFloatFloatLevel; 3646 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 3647 return NVPTXISD::TexUnifiedCubeS32Float; 3648 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 3649 return NVPTXISD::TexUnifiedCubeS32FloatLevel; 3650 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 3651 return NVPTXISD::TexUnifiedCubeU32Float; 3652 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 3653 return NVPTXISD::TexUnifiedCubeU32FloatLevel; 3654 3655 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 3656 return NVPTXISD::TexUnifiedCubeArrayFloatFloat; 3657 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 3658 return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel; 3659 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 3660 return NVPTXISD::TexUnifiedCubeArrayS32Float; 3661 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 3662 return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel; 3663 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 3664 return NVPTXISD::TexUnifiedCubeArrayU32Float; 3665 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 3666 return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; 3667 3668 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32: 3669 return NVPTXISD::TexUnifiedCubeFloatFloatGrad; 3670 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32: 3671 return NVPTXISD::TexUnifiedCubeS32FloatGrad; 3672 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32: 3673 return NVPTXISD::TexUnifiedCubeU32FloatGrad; 3674 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32: 3675 return NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad; 3676 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32: 3677 return NVPTXISD::TexUnifiedCubeArrayS32FloatGrad; 3678 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32: 3679 return NVPTXISD::TexUnifiedCubeArrayU32FloatGrad; 3680 3681 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 3682 return NVPTXISD::Tld4UnifiedR2DFloatFloat; 3683 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 3684 return NVPTXISD::Tld4UnifiedG2DFloatFloat; 3685 case 
Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 3686 return NVPTXISD::Tld4UnifiedB2DFloatFloat; 3687 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 3688 return NVPTXISD::Tld4UnifiedA2DFloatFloat; 3689 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 3690 return NVPTXISD::Tld4UnifiedR2DS64Float; 3691 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 3692 return NVPTXISD::Tld4UnifiedG2DS64Float; 3693 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 3694 return NVPTXISD::Tld4UnifiedB2DS64Float; 3695 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 3696 return NVPTXISD::Tld4UnifiedA2DS64Float; 3697 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 3698 return NVPTXISD::Tld4UnifiedR2DU64Float; 3699 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 3700 return NVPTXISD::Tld4UnifiedG2DU64Float; 3701 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 3702 return NVPTXISD::Tld4UnifiedB2DU64Float; 3703 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 3704 return NVPTXISD::Tld4UnifiedA2DU64Float; 3705 } 3706 } 3707 3708 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { 3709 switch (Intrinsic) { 3710 default: 3711 return 0; 3712 case Intrinsic::nvvm_suld_1d_i8_clamp: 3713 return NVPTXISD::Suld1DI8Clamp; 3714 case Intrinsic::nvvm_suld_1d_i16_clamp: 3715 return NVPTXISD::Suld1DI16Clamp; 3716 case Intrinsic::nvvm_suld_1d_i32_clamp: 3717 return NVPTXISD::Suld1DI32Clamp; 3718 case Intrinsic::nvvm_suld_1d_i64_clamp: 3719 return NVPTXISD::Suld1DI64Clamp; 3720 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 3721 return NVPTXISD::Suld1DV2I8Clamp; 3722 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 3723 return NVPTXISD::Suld1DV2I16Clamp; 3724 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 3725 return NVPTXISD::Suld1DV2I32Clamp; 3726 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 3727 return NVPTXISD::Suld1DV2I64Clamp; 3728 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 3729 return NVPTXISD::Suld1DV4I8Clamp; 3730 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 3731 return NVPTXISD::Suld1DV4I16Clamp; 3732 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 3733 return NVPTXISD::Suld1DV4I32Clamp; 3734 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 3735 return NVPTXISD::Suld1DArrayI8Clamp; 3736 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 3737 return NVPTXISD::Suld1DArrayI16Clamp; 3738 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 3739 return NVPTXISD::Suld1DArrayI32Clamp; 3740 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 3741 return NVPTXISD::Suld1DArrayI64Clamp; 3742 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 3743 return NVPTXISD::Suld1DArrayV2I8Clamp; 3744 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 3745 return NVPTXISD::Suld1DArrayV2I16Clamp; 3746 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 3747 return NVPTXISD::Suld1DArrayV2I32Clamp; 3748 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 3749 return NVPTXISD::Suld1DArrayV2I64Clamp; 3750 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 3751 return NVPTXISD::Suld1DArrayV4I8Clamp; 3752 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 3753 return NVPTXISD::Suld1DArrayV4I16Clamp; 3754 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 3755 return NVPTXISD::Suld1DArrayV4I32Clamp; 3756 case Intrinsic::nvvm_suld_2d_i8_clamp: 3757 return NVPTXISD::Suld2DI8Clamp; 3758 case Intrinsic::nvvm_suld_2d_i16_clamp: 3759 return NVPTXISD::Suld2DI16Clamp; 3760 case Intrinsic::nvvm_suld_2d_i32_clamp: 3761 return NVPTXISD::Suld2DI32Clamp; 3762 case Intrinsic::nvvm_suld_2d_i64_clamp: 3763 return NVPTXISD::Suld2DI64Clamp; 3764 case Intrinsic::nvvm_suld_2d_v2i8_clamp: 3765 return NVPTXISD::Suld2DV2I8Clamp; 3766 
case Intrinsic::nvvm_suld_2d_v2i16_clamp: 3767 return NVPTXISD::Suld2DV2I16Clamp; 3768 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 3769 return NVPTXISD::Suld2DV2I32Clamp; 3770 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 3771 return NVPTXISD::Suld2DV2I64Clamp; 3772 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 3773 return NVPTXISD::Suld2DV4I8Clamp; 3774 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 3775 return NVPTXISD::Suld2DV4I16Clamp; 3776 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 3777 return NVPTXISD::Suld2DV4I32Clamp; 3778 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 3779 return NVPTXISD::Suld2DArrayI8Clamp; 3780 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 3781 return NVPTXISD::Suld2DArrayI16Clamp; 3782 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 3783 return NVPTXISD::Suld2DArrayI32Clamp; 3784 case Intrinsic::nvvm_suld_2d_array_i64_clamp: 3785 return NVPTXISD::Suld2DArrayI64Clamp; 3786 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 3787 return NVPTXISD::Suld2DArrayV2I8Clamp; 3788 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 3789 return NVPTXISD::Suld2DArrayV2I16Clamp; 3790 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 3791 return NVPTXISD::Suld2DArrayV2I32Clamp; 3792 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 3793 return NVPTXISD::Suld2DArrayV2I64Clamp; 3794 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 3795 return NVPTXISD::Suld2DArrayV4I8Clamp; 3796 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 3797 return NVPTXISD::Suld2DArrayV4I16Clamp; 3798 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 3799 return NVPTXISD::Suld2DArrayV4I32Clamp; 3800 case Intrinsic::nvvm_suld_3d_i8_clamp: 3801 return NVPTXISD::Suld3DI8Clamp; 3802 case Intrinsic::nvvm_suld_3d_i16_clamp: 3803 return NVPTXISD::Suld3DI16Clamp; 3804 case Intrinsic::nvvm_suld_3d_i32_clamp: 3805 return NVPTXISD::Suld3DI32Clamp; 3806 case Intrinsic::nvvm_suld_3d_i64_clamp: 3807 return NVPTXISD::Suld3DI64Clamp; 3808 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 3809 return NVPTXISD::Suld3DV2I8Clamp; 3810 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 3811 return NVPTXISD::Suld3DV2I16Clamp; 3812 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 3813 return NVPTXISD::Suld3DV2I32Clamp; 3814 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 3815 return NVPTXISD::Suld3DV2I64Clamp; 3816 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 3817 return NVPTXISD::Suld3DV4I8Clamp; 3818 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 3819 return NVPTXISD::Suld3DV4I16Clamp; 3820 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 3821 return NVPTXISD::Suld3DV4I32Clamp; 3822 case Intrinsic::nvvm_suld_1d_i8_trap: 3823 return NVPTXISD::Suld1DI8Trap; 3824 case Intrinsic::nvvm_suld_1d_i16_trap: 3825 return NVPTXISD::Suld1DI16Trap; 3826 case Intrinsic::nvvm_suld_1d_i32_trap: 3827 return NVPTXISD::Suld1DI32Trap; 3828 case Intrinsic::nvvm_suld_1d_i64_trap: 3829 return NVPTXISD::Suld1DI64Trap; 3830 case Intrinsic::nvvm_suld_1d_v2i8_trap: 3831 return NVPTXISD::Suld1DV2I8Trap; 3832 case Intrinsic::nvvm_suld_1d_v2i16_trap: 3833 return NVPTXISD::Suld1DV2I16Trap; 3834 case Intrinsic::nvvm_suld_1d_v2i32_trap: 3835 return NVPTXISD::Suld1DV2I32Trap; 3836 case Intrinsic::nvvm_suld_1d_v2i64_trap: 3837 return NVPTXISD::Suld1DV2I64Trap; 3838 case Intrinsic::nvvm_suld_1d_v4i8_trap: 3839 return NVPTXISD::Suld1DV4I8Trap; 3840 case Intrinsic::nvvm_suld_1d_v4i16_trap: 3841 return NVPTXISD::Suld1DV4I16Trap; 3842 case Intrinsic::nvvm_suld_1d_v4i32_trap: 3843 return NVPTXISD::Suld1DV4I32Trap; 3844 case Intrinsic::nvvm_suld_1d_array_i8_trap: 3845 return NVPTXISD::Suld1DArrayI8Trap; 3846 case 
Intrinsic::nvvm_suld_1d_array_i16_trap: 3847 return NVPTXISD::Suld1DArrayI16Trap; 3848 case Intrinsic::nvvm_suld_1d_array_i32_trap: 3849 return NVPTXISD::Suld1DArrayI32Trap; 3850 case Intrinsic::nvvm_suld_1d_array_i64_trap: 3851 return NVPTXISD::Suld1DArrayI64Trap; 3852 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 3853 return NVPTXISD::Suld1DArrayV2I8Trap; 3854 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 3855 return NVPTXISD::Suld1DArrayV2I16Trap; 3856 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 3857 return NVPTXISD::Suld1DArrayV2I32Trap; 3858 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 3859 return NVPTXISD::Suld1DArrayV2I64Trap; 3860 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 3861 return NVPTXISD::Suld1DArrayV4I8Trap; 3862 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 3863 return NVPTXISD::Suld1DArrayV4I16Trap; 3864 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 3865 return NVPTXISD::Suld1DArrayV4I32Trap; 3866 case Intrinsic::nvvm_suld_2d_i8_trap: 3867 return NVPTXISD::Suld2DI8Trap; 3868 case Intrinsic::nvvm_suld_2d_i16_trap: 3869 return NVPTXISD::Suld2DI16Trap; 3870 case Intrinsic::nvvm_suld_2d_i32_trap: 3871 return NVPTXISD::Suld2DI32Trap; 3872 case Intrinsic::nvvm_suld_2d_i64_trap: 3873 return NVPTXISD::Suld2DI64Trap; 3874 case Intrinsic::nvvm_suld_2d_v2i8_trap: 3875 return NVPTXISD::Suld2DV2I8Trap; 3876 case Intrinsic::nvvm_suld_2d_v2i16_trap: 3877 return NVPTXISD::Suld2DV2I16Trap; 3878 case Intrinsic::nvvm_suld_2d_v2i32_trap: 3879 return NVPTXISD::Suld2DV2I32Trap; 3880 case Intrinsic::nvvm_suld_2d_v2i64_trap: 3881 return NVPTXISD::Suld2DV2I64Trap; 3882 case Intrinsic::nvvm_suld_2d_v4i8_trap: 3883 return NVPTXISD::Suld2DV4I8Trap; 3884 case Intrinsic::nvvm_suld_2d_v4i16_trap: 3885 return NVPTXISD::Suld2DV4I16Trap; 3886 case Intrinsic::nvvm_suld_2d_v4i32_trap: 3887 return NVPTXISD::Suld2DV4I32Trap; 3888 case Intrinsic::nvvm_suld_2d_array_i8_trap: 3889 return NVPTXISD::Suld2DArrayI8Trap; 3890 case Intrinsic::nvvm_suld_2d_array_i16_trap: 3891 return NVPTXISD::Suld2DArrayI16Trap; 3892 case Intrinsic::nvvm_suld_2d_array_i32_trap: 3893 return NVPTXISD::Suld2DArrayI32Trap; 3894 case Intrinsic::nvvm_suld_2d_array_i64_trap: 3895 return NVPTXISD::Suld2DArrayI64Trap; 3896 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 3897 return NVPTXISD::Suld2DArrayV2I8Trap; 3898 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 3899 return NVPTXISD::Suld2DArrayV2I16Trap; 3900 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 3901 return NVPTXISD::Suld2DArrayV2I32Trap; 3902 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 3903 return NVPTXISD::Suld2DArrayV2I64Trap; 3904 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 3905 return NVPTXISD::Suld2DArrayV4I8Trap; 3906 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 3907 return NVPTXISD::Suld2DArrayV4I16Trap; 3908 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 3909 return NVPTXISD::Suld2DArrayV4I32Trap; 3910 case Intrinsic::nvvm_suld_3d_i8_trap: 3911 return NVPTXISD::Suld3DI8Trap; 3912 case Intrinsic::nvvm_suld_3d_i16_trap: 3913 return NVPTXISD::Suld3DI16Trap; 3914 case Intrinsic::nvvm_suld_3d_i32_trap: 3915 return NVPTXISD::Suld3DI32Trap; 3916 case Intrinsic::nvvm_suld_3d_i64_trap: 3917 return NVPTXISD::Suld3DI64Trap; 3918 case Intrinsic::nvvm_suld_3d_v2i8_trap: 3919 return NVPTXISD::Suld3DV2I8Trap; 3920 case Intrinsic::nvvm_suld_3d_v2i16_trap: 3921 return NVPTXISD::Suld3DV2I16Trap; 3922 case Intrinsic::nvvm_suld_3d_v2i32_trap: 3923 return NVPTXISD::Suld3DV2I32Trap; 3924 case Intrinsic::nvvm_suld_3d_v2i64_trap: 3925 return NVPTXISD::Suld3DV2I64Trap; 3926 case 
Intrinsic::nvvm_suld_3d_v4i8_trap: 3927 return NVPTXISD::Suld3DV4I8Trap; 3928 case Intrinsic::nvvm_suld_3d_v4i16_trap: 3929 return NVPTXISD::Suld3DV4I16Trap; 3930 case Intrinsic::nvvm_suld_3d_v4i32_trap: 3931 return NVPTXISD::Suld3DV4I32Trap; 3932 case Intrinsic::nvvm_suld_1d_i8_zero: 3933 return NVPTXISD::Suld1DI8Zero; 3934 case Intrinsic::nvvm_suld_1d_i16_zero: 3935 return NVPTXISD::Suld1DI16Zero; 3936 case Intrinsic::nvvm_suld_1d_i32_zero: 3937 return NVPTXISD::Suld1DI32Zero; 3938 case Intrinsic::nvvm_suld_1d_i64_zero: 3939 return NVPTXISD::Suld1DI64Zero; 3940 case Intrinsic::nvvm_suld_1d_v2i8_zero: 3941 return NVPTXISD::Suld1DV2I8Zero; 3942 case Intrinsic::nvvm_suld_1d_v2i16_zero: 3943 return NVPTXISD::Suld1DV2I16Zero; 3944 case Intrinsic::nvvm_suld_1d_v2i32_zero: 3945 return NVPTXISD::Suld1DV2I32Zero; 3946 case Intrinsic::nvvm_suld_1d_v2i64_zero: 3947 return NVPTXISD::Suld1DV2I64Zero; 3948 case Intrinsic::nvvm_suld_1d_v4i8_zero: 3949 return NVPTXISD::Suld1DV4I8Zero; 3950 case Intrinsic::nvvm_suld_1d_v4i16_zero: 3951 return NVPTXISD::Suld1DV4I16Zero; 3952 case Intrinsic::nvvm_suld_1d_v4i32_zero: 3953 return NVPTXISD::Suld1DV4I32Zero; 3954 case Intrinsic::nvvm_suld_1d_array_i8_zero: 3955 return NVPTXISD::Suld1DArrayI8Zero; 3956 case Intrinsic::nvvm_suld_1d_array_i16_zero: 3957 return NVPTXISD::Suld1DArrayI16Zero; 3958 case Intrinsic::nvvm_suld_1d_array_i32_zero: 3959 return NVPTXISD::Suld1DArrayI32Zero; 3960 case Intrinsic::nvvm_suld_1d_array_i64_zero: 3961 return NVPTXISD::Suld1DArrayI64Zero; 3962 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 3963 return NVPTXISD::Suld1DArrayV2I8Zero; 3964 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 3965 return NVPTXISD::Suld1DArrayV2I16Zero; 3966 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 3967 return NVPTXISD::Suld1DArrayV2I32Zero; 3968 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 3969 return NVPTXISD::Suld1DArrayV2I64Zero; 3970 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 3971 return NVPTXISD::Suld1DArrayV4I8Zero; 3972 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 3973 return NVPTXISD::Suld1DArrayV4I16Zero; 3974 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 3975 return NVPTXISD::Suld1DArrayV4I32Zero; 3976 case Intrinsic::nvvm_suld_2d_i8_zero: 3977 return NVPTXISD::Suld2DI8Zero; 3978 case Intrinsic::nvvm_suld_2d_i16_zero: 3979 return NVPTXISD::Suld2DI16Zero; 3980 case Intrinsic::nvvm_suld_2d_i32_zero: 3981 return NVPTXISD::Suld2DI32Zero; 3982 case Intrinsic::nvvm_suld_2d_i64_zero: 3983 return NVPTXISD::Suld2DI64Zero; 3984 case Intrinsic::nvvm_suld_2d_v2i8_zero: 3985 return NVPTXISD::Suld2DV2I8Zero; 3986 case Intrinsic::nvvm_suld_2d_v2i16_zero: 3987 return NVPTXISD::Suld2DV2I16Zero; 3988 case Intrinsic::nvvm_suld_2d_v2i32_zero: 3989 return NVPTXISD::Suld2DV2I32Zero; 3990 case Intrinsic::nvvm_suld_2d_v2i64_zero: 3991 return NVPTXISD::Suld2DV2I64Zero; 3992 case Intrinsic::nvvm_suld_2d_v4i8_zero: 3993 return NVPTXISD::Suld2DV4I8Zero; 3994 case Intrinsic::nvvm_suld_2d_v4i16_zero: 3995 return NVPTXISD::Suld2DV4I16Zero; 3996 case Intrinsic::nvvm_suld_2d_v4i32_zero: 3997 return NVPTXISD::Suld2DV4I32Zero; 3998 case Intrinsic::nvvm_suld_2d_array_i8_zero: 3999 return NVPTXISD::Suld2DArrayI8Zero; 4000 case Intrinsic::nvvm_suld_2d_array_i16_zero: 4001 return NVPTXISD::Suld2DArrayI16Zero; 4002 case Intrinsic::nvvm_suld_2d_array_i32_zero: 4003 return NVPTXISD::Suld2DArrayI32Zero; 4004 case Intrinsic::nvvm_suld_2d_array_i64_zero: 4005 return NVPTXISD::Suld2DArrayI64Zero; 4006 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 4007 return 
NVPTXISD::Suld2DArrayV2I8Zero; 4008 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 4009 return NVPTXISD::Suld2DArrayV2I16Zero; 4010 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 4011 return NVPTXISD::Suld2DArrayV2I32Zero; 4012 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 4013 return NVPTXISD::Suld2DArrayV2I64Zero; 4014 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 4015 return NVPTXISD::Suld2DArrayV4I8Zero; 4016 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 4017 return NVPTXISD::Suld2DArrayV4I16Zero; 4018 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 4019 return NVPTXISD::Suld2DArrayV4I32Zero; 4020 case Intrinsic::nvvm_suld_3d_i8_zero: 4021 return NVPTXISD::Suld3DI8Zero; 4022 case Intrinsic::nvvm_suld_3d_i16_zero: 4023 return NVPTXISD::Suld3DI16Zero; 4024 case Intrinsic::nvvm_suld_3d_i32_zero: 4025 return NVPTXISD::Suld3DI32Zero; 4026 case Intrinsic::nvvm_suld_3d_i64_zero: 4027 return NVPTXISD::Suld3DI64Zero; 4028 case Intrinsic::nvvm_suld_3d_v2i8_zero: 4029 return NVPTXISD::Suld3DV2I8Zero; 4030 case Intrinsic::nvvm_suld_3d_v2i16_zero: 4031 return NVPTXISD::Suld3DV2I16Zero; 4032 case Intrinsic::nvvm_suld_3d_v2i32_zero: 4033 return NVPTXISD::Suld3DV2I32Zero; 4034 case Intrinsic::nvvm_suld_3d_v2i64_zero: 4035 return NVPTXISD::Suld3DV2I64Zero; 4036 case Intrinsic::nvvm_suld_3d_v4i8_zero: 4037 return NVPTXISD::Suld3DV4I8Zero; 4038 case Intrinsic::nvvm_suld_3d_v4i16_zero: 4039 return NVPTXISD::Suld3DV4I16Zero; 4040 case Intrinsic::nvvm_suld_3d_v4i32_zero: 4041 return NVPTXISD::Suld3DV4I32Zero; 4042 } 4043 } 4044 4045 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as 4046 // TgtMemIntrinsic 4047 // because we need the information that is only available in the "Value" type 4048 // of destination 4049 // pointer. In particular, the address space information. 4050 bool NVPTXTargetLowering::getTgtMemIntrinsic( 4051 IntrinsicInfo &Info, const CallInst &I, 4052 MachineFunction &MF, unsigned Intrinsic) const { 4053 switch (Intrinsic) { 4054 default: 4055 return false; 4056 case Intrinsic::nvvm_match_all_sync_i32p: 4057 case Intrinsic::nvvm_match_all_sync_i64p: 4058 Info.opc = ISD::INTRINSIC_W_CHAIN; 4059 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute 4060 // in order to model data exchange with other threads, but perform no real 4061 // memory accesses. 4062 Info.memVT = MVT::i1; 4063 4064 // Our result depends on both our and other thread's arguments. 
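// Conservatively model that as both a read and a write so the intrinsic stays
// ordered with respect to other memory operations.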
4065 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 4066 return true; 4067 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col: 4068 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row: 4069 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride: 4070 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride: 4071 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col: 4072 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row: 4073 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride: 4074 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride: 4075 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col: 4076 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row: 4077 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride: 4078 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride: 4079 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col: 4080 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row: 4081 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride: 4082 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride: 4083 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col: 4084 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row: 4085 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride: 4086 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride: 4087 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col: 4088 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row: 4089 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride: 4090 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: { 4091 Info.opc = ISD::INTRINSIC_W_CHAIN; 4092 Info.memVT = MVT::v8f16; 4093 Info.ptrVal = I.getArgOperand(0); 4094 Info.offset = 0; 4095 Info.flags = MachineMemOperand::MOLoad; 4096 Info.align = Align(16); 4097 return true; 4098 } 4099 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col: 4100 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride: 4101 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride: 4102 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col: 4103 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row: 4104 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride: 4105 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride: 4106 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row: 4107 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col: 4108 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride: 4109 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row: 4110 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride: 4111 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col: 4112 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride: 4113 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride: 4114 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col: 4115 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row: 4116 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride: 4117 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride: 4118 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: 4119 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col: 4120 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride: 4121 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row: 4122 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: { 4123 Info.opc = ISD::INTRINSIC_W_CHAIN; 4124 Info.memVT = MVT::v2i32; 4125 Info.ptrVal = I.getArgOperand(0); 4126 Info.offset = 0; 4127 Info.flags = MachineMemOperand::MOLoad; 4128 Info.align = Align(8); 4129 return true; 4130 } 4131 4132 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col: 4133 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride: 4134 
case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride: 4135 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col: 4136 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row: 4137 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride: 4138 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride: 4139 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row: 4140 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col: 4141 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride: 4142 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row: 4143 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride: 4144 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col: 4145 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride: 4146 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row: 4147 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride: 4148 4149 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col: 4150 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride: 4151 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride: 4152 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col: 4153 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row: 4154 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride: 4155 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride: 4156 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: 4157 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col: 4158 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride: 4159 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row: 4160 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride: 4161 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col: 4162 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride: 4163 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row: 4164 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride: 4165 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16: 4166 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: { 4167 Info.opc = ISD::INTRINSIC_W_CHAIN; 4168 Info.memVT = MVT::v4i32; 4169 Info.ptrVal = I.getArgOperand(0); 4170 Info.offset = 0; 4171 Info.flags = MachineMemOperand::MOLoad; 4172 Info.align = Align(16); 4173 return true; 4174 } 4175 4176 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col: 4177 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride: 4178 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride: 4179 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col: 4180 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row: 4181 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride: 4182 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride: 4183 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row: 4184 4185 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col: 4186 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride: 4187 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride: 4188 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col: 4189 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row: 4190 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride: 4191 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride: 4192 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row: 4193 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row: 4194 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride: 4195 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col: 4196 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride: 4197 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row: 4198 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride: 4199 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride: 4200 case 
Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row: 4201 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col: 4202 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride: 4203 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride: 4204 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: 4205 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16: 4206 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: { 4207 Info.opc = ISD::INTRINSIC_W_CHAIN; 4208 Info.memVT = MVT::i32; 4209 Info.ptrVal = I.getArgOperand(0); 4210 Info.offset = 0; 4211 Info.flags = MachineMemOperand::MOLoad; 4212 Info.align = Align(4); 4213 return true; 4214 } 4215 4216 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col: 4217 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row: 4218 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride: 4219 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride: 4220 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col: 4221 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row: 4222 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride: 4223 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride: 4224 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col: 4225 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row: 4226 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride: 4227 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: { 4228 Info.opc = ISD::INTRINSIC_W_CHAIN; 4229 Info.memVT = MVT::v4f16; 4230 Info.ptrVal = I.getArgOperand(0); 4231 Info.offset = 0; 4232 Info.flags = MachineMemOperand::MOLoad; 4233 Info.align = Align(16); 4234 return true; 4235 } 4236 4237 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col: 4238 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row: 4239 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride: 4240 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride: 4241 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col: 4242 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row: 4243 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride: 4244 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride: 4245 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col: 4246 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row: 4247 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride: 4248 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: 4249 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col: 4250 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row: 4251 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride: 4252 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: { 4253 Info.opc = ISD::INTRINSIC_W_CHAIN; 4254 Info.memVT = MVT::v8f32; 4255 Info.ptrVal = I.getArgOperand(0); 4256 Info.offset = 0; 4257 Info.flags = MachineMemOperand::MOLoad; 4258 Info.align = Align(16); 4259 return true; 4260 } 4261 4262 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col: 4263 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride: 4264 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row: 4265 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride: 4266 4267 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col: 4268 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride: 4269 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row: 4270 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride: 4271 4272 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col: 4273 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride: 4274 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row: 4275 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride: 4276 case 
Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col: 4277 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride: 4278 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row: 4279 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride: 4280 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col: 4281 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride: 4282 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row: 4283 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: { 4284 Info.opc = ISD::INTRINSIC_W_CHAIN; 4285 Info.memVT = MVT::v8i32; 4286 Info.ptrVal = I.getArgOperand(0); 4287 Info.offset = 0; 4288 Info.flags = MachineMemOperand::MOLoad; 4289 Info.align = Align(16); 4290 return true; 4291 } 4292 4293 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col: 4294 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride: 4295 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row: 4296 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride: 4297 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col: 4298 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride: 4299 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row: 4300 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: 4301 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16: 4302 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: { 4303 Info.opc = ISD::INTRINSIC_W_CHAIN; 4304 Info.memVT = MVT::v2i32; 4305 Info.ptrVal = I.getArgOperand(0); 4306 Info.offset = 0; 4307 Info.flags = MachineMemOperand::MOLoad; 4308 Info.align = Align(8); 4309 return true; 4310 } 4311 4312 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col: 4313 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride: 4314 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row: 4315 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride: 4316 4317 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col: 4318 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride: 4319 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row: 4320 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: { 4321 Info.opc = ISD::INTRINSIC_W_CHAIN; 4322 Info.memVT = MVT::f64; 4323 Info.ptrVal = I.getArgOperand(0); 4324 Info.offset = 0; 4325 Info.flags = MachineMemOperand::MOLoad; 4326 Info.align = Align(8); 4327 return true; 4328 } 4329 4330 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col: 4331 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride: 4332 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row: 4333 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: { 4334 Info.opc = ISD::INTRINSIC_W_CHAIN; 4335 Info.memVT = MVT::v2f64; 4336 Info.ptrVal = I.getArgOperand(0); 4337 Info.offset = 0; 4338 Info.flags = MachineMemOperand::MOLoad; 4339 Info.align = Align(16); 4340 return true; 4341 } 4342 4343 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col: 4344 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row: 4345 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride: 4346 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride: 4347 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col: 4348 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row: 4349 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride: 4350 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride: 4351 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col: 4352 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row: 4353 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride: 4354 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: { 4355 Info.opc = ISD::INTRINSIC_VOID; 4356 Info.memVT = MVT::v4f16; 4357 Info.ptrVal = I.getArgOperand(0); 4358 
Info.offset = 0; 4359 Info.flags = MachineMemOperand::MOStore; 4360 Info.align = Align(16); 4361 return true; 4362 } 4363 4364 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col: 4365 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row: 4366 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride: 4367 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride: 4368 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col: 4369 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row: 4370 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride: 4371 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride: 4372 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col: 4373 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row: 4374 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride: 4375 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: 4376 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col: 4377 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row: 4378 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride: 4379 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: { 4380 Info.opc = ISD::INTRINSIC_VOID; 4381 Info.memVT = MVT::v8f32; 4382 Info.ptrVal = I.getArgOperand(0); 4383 Info.offset = 0; 4384 Info.flags = MachineMemOperand::MOStore; 4385 Info.align = Align(16); 4386 return true; 4387 } 4388 4389 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col: 4390 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride: 4391 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row: 4392 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride: 4393 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col: 4394 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride: 4395 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row: 4396 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride: 4397 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col: 4398 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride: 4399 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row: 4400 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: { 4401 Info.opc = ISD::INTRINSIC_VOID; 4402 Info.memVT = MVT::v8i32; 4403 Info.ptrVal = I.getArgOperand(0); 4404 Info.offset = 0; 4405 Info.flags = MachineMemOperand::MOStore; 4406 Info.align = Align(16); 4407 return true; 4408 } 4409 4410 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col: 4411 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride: 4412 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row: 4413 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride: 4414 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col: 4415 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride: 4416 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row: 4417 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: { 4418 Info.opc = ISD::INTRINSIC_VOID; 4419 Info.memVT = MVT::v2i32; 4420 Info.ptrVal = I.getArgOperand(0); 4421 Info.offset = 0; 4422 Info.flags = MachineMemOperand::MOStore; 4423 Info.align = Align(8); 4424 return true; 4425 } 4426 4427 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col: 4428 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride: 4429 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row: 4430 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: { 4431 Info.opc = ISD::INTRINSIC_VOID; 4432 Info.memVT = MVT::v2f64; 4433 Info.ptrVal = I.getArgOperand(0); 4434 Info.offset = 0; 4435 Info.flags = MachineMemOperand::MOStore; 4436 Info.align = Align(16); 4437 return true; 4438 } 4439 4440 case Intrinsic::nvvm_atomic_load_inc_32: 4441 case 
Intrinsic::nvvm_atomic_load_dec_32: 4442 4443 case Intrinsic::nvvm_atomic_add_gen_f_cta: 4444 case Intrinsic::nvvm_atomic_add_gen_f_sys: 4445 case Intrinsic::nvvm_atomic_add_gen_i_cta: 4446 case Intrinsic::nvvm_atomic_add_gen_i_sys: 4447 case Intrinsic::nvvm_atomic_and_gen_i_cta: 4448 case Intrinsic::nvvm_atomic_and_gen_i_sys: 4449 case Intrinsic::nvvm_atomic_cas_gen_i_cta: 4450 case Intrinsic::nvvm_atomic_cas_gen_i_sys: 4451 case Intrinsic::nvvm_atomic_dec_gen_i_cta: 4452 case Intrinsic::nvvm_atomic_dec_gen_i_sys: 4453 case Intrinsic::nvvm_atomic_inc_gen_i_cta: 4454 case Intrinsic::nvvm_atomic_inc_gen_i_sys: 4455 case Intrinsic::nvvm_atomic_max_gen_i_cta: 4456 case Intrinsic::nvvm_atomic_max_gen_i_sys: 4457 case Intrinsic::nvvm_atomic_min_gen_i_cta: 4458 case Intrinsic::nvvm_atomic_min_gen_i_sys: 4459 case Intrinsic::nvvm_atomic_or_gen_i_cta: 4460 case Intrinsic::nvvm_atomic_or_gen_i_sys: 4461 case Intrinsic::nvvm_atomic_exch_gen_i_cta: 4462 case Intrinsic::nvvm_atomic_exch_gen_i_sys: 4463 case Intrinsic::nvvm_atomic_xor_gen_i_cta: 4464 case Intrinsic::nvvm_atomic_xor_gen_i_sys: { 4465 auto &DL = I.getModule()->getDataLayout(); 4466 Info.opc = ISD::INTRINSIC_W_CHAIN; 4467 Info.memVT = getValueType(DL, I.getType()); 4468 Info.ptrVal = I.getArgOperand(0); 4469 Info.offset = 0; 4470 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 4471 Info.align.reset(); 4472 return true; 4473 } 4474 4475 case Intrinsic::nvvm_ldu_global_i: 4476 case Intrinsic::nvvm_ldu_global_f: 4477 case Intrinsic::nvvm_ldu_global_p: { 4478 auto &DL = I.getModule()->getDataLayout(); 4479 Info.opc = ISD::INTRINSIC_W_CHAIN; 4480 if (Intrinsic == Intrinsic::nvvm_ldu_global_i) 4481 Info.memVT = getValueType(DL, I.getType()); 4482 else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) 4483 Info.memVT = getPointerTy(DL); 4484 else 4485 Info.memVT = getValueType(DL, I.getType()); 4486 Info.ptrVal = I.getArgOperand(0); 4487 Info.offset = 0; 4488 Info.flags = MachineMemOperand::MOLoad; 4489 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 4490 4491 return true; 4492 } 4493 case Intrinsic::nvvm_ldg_global_i: 4494 case Intrinsic::nvvm_ldg_global_f: 4495 case Intrinsic::nvvm_ldg_global_p: { 4496 auto &DL = I.getModule()->getDataLayout(); 4497 4498 Info.opc = ISD::INTRINSIC_W_CHAIN; 4499 if (Intrinsic == Intrinsic::nvvm_ldg_global_i) 4500 Info.memVT = getValueType(DL, I.getType()); 4501 else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) 4502 Info.memVT = getPointerTy(DL); 4503 else 4504 Info.memVT = getValueType(DL, I.getType()); 4505 Info.ptrVal = I.getArgOperand(0); 4506 Info.offset = 0; 4507 Info.flags = MachineMemOperand::MOLoad; 4508 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 4509 4510 return true; 4511 } 4512 4513 case Intrinsic::nvvm_tex_1d_v4f32_s32: 4514 case Intrinsic::nvvm_tex_1d_v4f32_f32: 4515 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 4516 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 4517 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 4518 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 4519 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 4520 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 4521 case Intrinsic::nvvm_tex_2d_v4f32_s32: 4522 case Intrinsic::nvvm_tex_2d_v4f32_f32: 4523 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 4524 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 4525 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 4526 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 4527 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 4528 case 
Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 4529 case Intrinsic::nvvm_tex_3d_v4f32_s32: 4530 case Intrinsic::nvvm_tex_3d_v4f32_f32: 4531 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 4532 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 4533 case Intrinsic::nvvm_tex_cube_v4f32_f32: 4534 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 4535 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 4536 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 4537 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 4538 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 4539 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 4540 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 4541 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 4542 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 4543 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 4544 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 4545 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 4546 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 4547 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 4548 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 4549 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 4550 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 4551 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 4552 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 4553 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 4554 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 4555 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 4556 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 4557 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 4558 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 4559 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 4560 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 4561 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 4562 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 4563 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 4564 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 4565 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32: 4566 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32: 4567 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 4568 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 4569 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 4570 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 4571 Info.opc = getOpcForTextureInstr(Intrinsic); 4572 Info.memVT = MVT::v4f32; 4573 Info.ptrVal = nullptr; 4574 Info.offset = 0; 4575 Info.flags = MachineMemOperand::MOLoad; 4576 Info.align = Align(16); 4577 return true; 4578 4579 case Intrinsic::nvvm_tex_1d_v4s32_s32: 4580 case Intrinsic::nvvm_tex_1d_v4s32_f32: 4581 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 4582 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 4583 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 4584 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 4585 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 4586 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 4587 case Intrinsic::nvvm_tex_2d_v4s32_s32: 4588 case Intrinsic::nvvm_tex_2d_v4s32_f32: 4589 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 4590 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 4591 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 4592 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 4593 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 4594 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 4595 case Intrinsic::nvvm_tex_3d_v4s32_s32: 4596 case Intrinsic::nvvm_tex_3d_v4s32_f32: 4597 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 4598 case 
Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 4599 case Intrinsic::nvvm_tex_cube_v4s32_f32: 4600 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 4601 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 4602 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 4603 case Intrinsic::nvvm_tex_cube_v4u32_f32: 4604 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 4605 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 4606 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 4607 case Intrinsic::nvvm_tex_1d_v4u32_s32: 4608 case Intrinsic::nvvm_tex_1d_v4u32_f32: 4609 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 4610 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 4611 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 4612 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 4613 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 4614 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 4615 case Intrinsic::nvvm_tex_2d_v4u32_s32: 4616 case Intrinsic::nvvm_tex_2d_v4u32_f32: 4617 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 4618 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 4619 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 4620 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 4621 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 4622 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 4623 case Intrinsic::nvvm_tex_3d_v4u32_s32: 4624 case Intrinsic::nvvm_tex_3d_v4u32_f32: 4625 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 4626 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 4627 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 4628 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 4629 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 4630 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 4631 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 4632 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 4633 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 4634 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 4635 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 4636 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 4637 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 4638 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 4639 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 4640 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 4641 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 4642 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 4643 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 4644 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 4645 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 4646 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 4647 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 4648 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 4649 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 4650 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 4651 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 4652 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 4653 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 4654 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 4655 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 4656 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 4657 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 4658 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 4659 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 4660 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 4661 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 4662 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 4663 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 4664 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 4665 case 
Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 4666 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 4667 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 4668 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 4669 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 4670 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 4671 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 4672 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 4673 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 4674 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 4675 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 4676 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 4677 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 4678 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 4679 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 4680 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 4681 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 4682 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 4683 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32: 4684 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32: 4685 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32: 4686 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32: 4687 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 4688 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 4689 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 4690 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 4691 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 4692 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 4693 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 4694 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 4695 Info.opc = getOpcForTextureInstr(Intrinsic); 4696 Info.memVT = MVT::v4i32; 4697 Info.ptrVal = nullptr; 4698 Info.offset = 0; 4699 Info.flags = MachineMemOperand::MOLoad; 4700 Info.align = Align(16); 4701 return true; 4702 4703 case Intrinsic::nvvm_suld_1d_i8_clamp: 4704 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 4705 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 4706 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 4707 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 4708 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 4709 case Intrinsic::nvvm_suld_2d_i8_clamp: 4710 case Intrinsic::nvvm_suld_2d_v2i8_clamp: 4711 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 4712 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 4713 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 4714 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 4715 case Intrinsic::nvvm_suld_3d_i8_clamp: 4716 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 4717 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 4718 case Intrinsic::nvvm_suld_1d_i8_trap: 4719 case Intrinsic::nvvm_suld_1d_v2i8_trap: 4720 case Intrinsic::nvvm_suld_1d_v4i8_trap: 4721 case Intrinsic::nvvm_suld_1d_array_i8_trap: 4722 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 4723 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 4724 case Intrinsic::nvvm_suld_2d_i8_trap: 4725 case Intrinsic::nvvm_suld_2d_v2i8_trap: 4726 case Intrinsic::nvvm_suld_2d_v4i8_trap: 4727 case Intrinsic::nvvm_suld_2d_array_i8_trap: 4728 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 4729 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 4730 case Intrinsic::nvvm_suld_3d_i8_trap: 4731 case Intrinsic::nvvm_suld_3d_v2i8_trap: 4732 case Intrinsic::nvvm_suld_3d_v4i8_trap: 4733 case Intrinsic::nvvm_suld_1d_i8_zero: 4734 case Intrinsic::nvvm_suld_1d_v2i8_zero: 4735 case Intrinsic::nvvm_suld_1d_v4i8_zero: 4736 case 
Intrinsic::nvvm_suld_1d_array_i8_zero: 4737 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 4738 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 4739 case Intrinsic::nvvm_suld_2d_i8_zero: 4740 case Intrinsic::nvvm_suld_2d_v2i8_zero: 4741 case Intrinsic::nvvm_suld_2d_v4i8_zero: 4742 case Intrinsic::nvvm_suld_2d_array_i8_zero: 4743 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 4744 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 4745 case Intrinsic::nvvm_suld_3d_i8_zero: 4746 case Intrinsic::nvvm_suld_3d_v2i8_zero: 4747 case Intrinsic::nvvm_suld_3d_v4i8_zero: 4748 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4749 Info.memVT = MVT::i8; 4750 Info.ptrVal = nullptr; 4751 Info.offset = 0; 4752 Info.flags = MachineMemOperand::MOLoad; 4753 Info.align = Align(16); 4754 return true; 4755 4756 case Intrinsic::nvvm_suld_1d_i16_clamp: 4757 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 4758 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 4759 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 4760 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 4761 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 4762 case Intrinsic::nvvm_suld_2d_i16_clamp: 4763 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 4764 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 4765 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 4766 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 4767 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 4768 case Intrinsic::nvvm_suld_3d_i16_clamp: 4769 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 4770 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 4771 case Intrinsic::nvvm_suld_1d_i16_trap: 4772 case Intrinsic::nvvm_suld_1d_v2i16_trap: 4773 case Intrinsic::nvvm_suld_1d_v4i16_trap: 4774 case Intrinsic::nvvm_suld_1d_array_i16_trap: 4775 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 4776 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 4777 case Intrinsic::nvvm_suld_2d_i16_trap: 4778 case Intrinsic::nvvm_suld_2d_v2i16_trap: 4779 case Intrinsic::nvvm_suld_2d_v4i16_trap: 4780 case Intrinsic::nvvm_suld_2d_array_i16_trap: 4781 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 4782 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 4783 case Intrinsic::nvvm_suld_3d_i16_trap: 4784 case Intrinsic::nvvm_suld_3d_v2i16_trap: 4785 case Intrinsic::nvvm_suld_3d_v4i16_trap: 4786 case Intrinsic::nvvm_suld_1d_i16_zero: 4787 case Intrinsic::nvvm_suld_1d_v2i16_zero: 4788 case Intrinsic::nvvm_suld_1d_v4i16_zero: 4789 case Intrinsic::nvvm_suld_1d_array_i16_zero: 4790 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 4791 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 4792 case Intrinsic::nvvm_suld_2d_i16_zero: 4793 case Intrinsic::nvvm_suld_2d_v2i16_zero: 4794 case Intrinsic::nvvm_suld_2d_v4i16_zero: 4795 case Intrinsic::nvvm_suld_2d_array_i16_zero: 4796 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 4797 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 4798 case Intrinsic::nvvm_suld_3d_i16_zero: 4799 case Intrinsic::nvvm_suld_3d_v2i16_zero: 4800 case Intrinsic::nvvm_suld_3d_v4i16_zero: 4801 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4802 Info.memVT = MVT::i16; 4803 Info.ptrVal = nullptr; 4804 Info.offset = 0; 4805 Info.flags = MachineMemOperand::MOLoad; 4806 Info.align = Align(16); 4807 return true; 4808 4809 case Intrinsic::nvvm_suld_1d_i32_clamp: 4810 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 4811 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 4812 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 4813 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 4814 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 4815 case Intrinsic::nvvm_suld_2d_i32_clamp: 4816 case 
Intrinsic::nvvm_suld_2d_v2i32_clamp: 4817 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 4818 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 4819 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 4820 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 4821 case Intrinsic::nvvm_suld_3d_i32_clamp: 4822 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 4823 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 4824 case Intrinsic::nvvm_suld_1d_i32_trap: 4825 case Intrinsic::nvvm_suld_1d_v2i32_trap: 4826 case Intrinsic::nvvm_suld_1d_v4i32_trap: 4827 case Intrinsic::nvvm_suld_1d_array_i32_trap: 4828 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 4829 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 4830 case Intrinsic::nvvm_suld_2d_i32_trap: 4831 case Intrinsic::nvvm_suld_2d_v2i32_trap: 4832 case Intrinsic::nvvm_suld_2d_v4i32_trap: 4833 case Intrinsic::nvvm_suld_2d_array_i32_trap: 4834 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 4835 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 4836 case Intrinsic::nvvm_suld_3d_i32_trap: 4837 case Intrinsic::nvvm_suld_3d_v2i32_trap: 4838 case Intrinsic::nvvm_suld_3d_v4i32_trap: 4839 case Intrinsic::nvvm_suld_1d_i32_zero: 4840 case Intrinsic::nvvm_suld_1d_v2i32_zero: 4841 case Intrinsic::nvvm_suld_1d_v4i32_zero: 4842 case Intrinsic::nvvm_suld_1d_array_i32_zero: 4843 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 4844 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 4845 case Intrinsic::nvvm_suld_2d_i32_zero: 4846 case Intrinsic::nvvm_suld_2d_v2i32_zero: 4847 case Intrinsic::nvvm_suld_2d_v4i32_zero: 4848 case Intrinsic::nvvm_suld_2d_array_i32_zero: 4849 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 4850 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 4851 case Intrinsic::nvvm_suld_3d_i32_zero: 4852 case Intrinsic::nvvm_suld_3d_v2i32_zero: 4853 case Intrinsic::nvvm_suld_3d_v4i32_zero: 4854 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4855 Info.memVT = MVT::i32; 4856 Info.ptrVal = nullptr; 4857 Info.offset = 0; 4858 Info.flags = MachineMemOperand::MOLoad; 4859 Info.align = Align(16); 4860 return true; 4861 4862 case Intrinsic::nvvm_suld_1d_i64_clamp: 4863 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 4864 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 4865 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 4866 case Intrinsic::nvvm_suld_2d_i64_clamp: 4867 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 4868 case Intrinsic::nvvm_suld_2d_array_i64_clamp: 4869 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 4870 case Intrinsic::nvvm_suld_3d_i64_clamp: 4871 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 4872 case Intrinsic::nvvm_suld_1d_i64_trap: 4873 case Intrinsic::nvvm_suld_1d_v2i64_trap: 4874 case Intrinsic::nvvm_suld_1d_array_i64_trap: 4875 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 4876 case Intrinsic::nvvm_suld_2d_i64_trap: 4877 case Intrinsic::nvvm_suld_2d_v2i64_trap: 4878 case Intrinsic::nvvm_suld_2d_array_i64_trap: 4879 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 4880 case Intrinsic::nvvm_suld_3d_i64_trap: 4881 case Intrinsic::nvvm_suld_3d_v2i64_trap: 4882 case Intrinsic::nvvm_suld_1d_i64_zero: 4883 case Intrinsic::nvvm_suld_1d_v2i64_zero: 4884 case Intrinsic::nvvm_suld_1d_array_i64_zero: 4885 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 4886 case Intrinsic::nvvm_suld_2d_i64_zero: 4887 case Intrinsic::nvvm_suld_2d_v2i64_zero: 4888 case Intrinsic::nvvm_suld_2d_array_i64_zero: 4889 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 4890 case Intrinsic::nvvm_suld_3d_i64_zero: 4891 case Intrinsic::nvvm_suld_3d_v2i64_zero: 4892 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4893 Info.memVT = MVT::i64; 4894 
Info.ptrVal = nullptr; 4895 Info.offset = 0; 4896 Info.flags = MachineMemOperand::MOLoad; 4897 Info.align = Align(16); 4898 return true; 4899 } 4900 return false; 4901 } 4902 4903 /// getFunctionParamOptimizedAlign - since function arguments are passed via 4904 /// .param space, we may want to increase their alignment in a way that 4905 /// ensures that we can effectively vectorize their loads & stores. We can 4906 /// increase alignment only if the function has internal or has private 4907 /// linkage as for other linkage types callers may already rely on default 4908 /// alignment. To allow using 128-bit vectorized loads/stores, this function 4909 /// ensures that alignment is 16 or greater. 4910 Align NVPTXTargetLowering::getFunctionParamOptimizedAlign( 4911 const Function *F, Type *ArgTy, const DataLayout &DL) const { 4912 const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value(); 4913 4914 // If a function has linkage different from internal or private, we 4915 // must use default ABI alignment as external users rely on it. Same 4916 // for a function that may be called from a function pointer. 4917 if (!F || !F->hasLocalLinkage() || 4918 F->hasAddressTaken(/*Users=*/nullptr, 4919 /*IgnoreCallbackUses=*/false, 4920 /*IgnoreAssumeLikeCalls=*/true, 4921 /*IgnoreLLVMUsed=*/true)) 4922 return Align(ABITypeAlign); 4923 4924 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage"); 4925 return Align(std::max(uint64_t(16), ABITypeAlign)); 4926 } 4927 4928 /// Helper for computing alignment of a device function byval parameter. 4929 Align NVPTXTargetLowering::getFunctionByValParamAlign( 4930 const Function *F, Type *ArgTy, Align InitialAlign, 4931 const DataLayout &DL) const { 4932 Align ArgAlign = InitialAlign; 4933 // Try to increase alignment to enhance vectorization options. 4934 if (F) 4935 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL)); 4936 4937 // Old ptx versions have a bug. When PTX code takes address of 4938 // byval parameter with alignment < 4, ptxas generates code to 4939 // spill argument into memory. Alas on sm_50+ ptxas generates 4940 // SASS code that fails with misaligned access. To work around 4941 // the problem, make sure that we align byval parameters by at 4942 // least 4. This bug seems to be fixed at least starting from 4943 // ptxas > 9.0. 4944 // TODO: remove this after verifying the bug is not reproduced 4945 // on non-deprecated ptxas versions. 4946 if (ForceMinByValParamAlign) 4947 ArgAlign = std::max(ArgAlign, Align(4)); 4948 4949 return ArgAlign; 4950 } 4951 4952 // Helper for getting a function parameter name. Name is composed from 4953 // its index and the function name. Negative index corresponds to special 4954 // parameter (unsized array) used for passing variable arguments. 4955 std::string NVPTXTargetLowering::getParamName(const Function *F, 4956 int Idx) const { 4957 std::string ParamName; 4958 raw_string_ostream ParamStr(ParamName); 4959 4960 ParamStr << getTargetMachine().getSymbol(F)->getName(); 4961 if (Idx < 0) 4962 ParamStr << "_vararg"; 4963 else 4964 ParamStr << "_param_" << Idx; 4965 4966 return ParamName; 4967 } 4968 4969 /// isLegalAddressingMode - Return true if the addressing mode represented 4970 /// by AM is legal for this target, for a load/store of the specified type. 
4971 /// Used to guide target specific optimizations, like loop strength reduction 4972 /// (LoopStrengthReduce.cpp) and memory optimization for address mode 4973 /// (CodeGenPrepare.cpp) 4974 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, 4975 const AddrMode &AM, Type *Ty, 4976 unsigned AS, Instruction *I) const { 4977 // AddrMode - This represents an addressing mode of: 4978 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg 4979 // 4980 // The legal address modes are 4981 // - [avar] 4982 // - [areg] 4983 // - [areg+immoff] 4984 // - [immAddr] 4985 4986 if (AM.BaseGV) { 4987 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale; 4988 } 4989 4990 switch (AM.Scale) { 4991 case 0: // "r", "r+i" or "i" is allowed 4992 break; 4993 case 1: 4994 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed. 4995 return false; 4996 // Otherwise we have r+i. 4997 break; 4998 default: 4999 // No scale > 1 is allowed 5000 return false; 5001 } 5002 return true; 5003 } 5004 5005 //===----------------------------------------------------------------------===// 5006 // NVPTX Inline Assembly Support 5007 //===----------------------------------------------------------------------===// 5008 5009 /// getConstraintType - Given a constraint letter, return the type of 5010 /// constraint it is for this target. 5011 NVPTXTargetLowering::ConstraintType 5012 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { 5013 if (Constraint.size() == 1) { 5014 switch (Constraint[0]) { 5015 default: 5016 break; 5017 case 'b': 5018 case 'r': 5019 case 'h': 5020 case 'c': 5021 case 'l': 5022 case 'f': 5023 case 'd': 5024 case '0': 5025 case 'N': 5026 return C_RegisterClass; 5027 } 5028 } 5029 return TargetLowering::getConstraintType(Constraint); 5030 } 5031 5032 std::pair<unsigned, const TargetRegisterClass *> 5033 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 5034 StringRef Constraint, 5035 MVT VT) const { 5036 if (Constraint.size() == 1) { 5037 switch (Constraint[0]) { 5038 case 'b': 5039 return std::make_pair(0U, &NVPTX::Int1RegsRegClass); 5040 case 'c': 5041 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 5042 case 'h': 5043 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 5044 case 'r': 5045 return std::make_pair(0U, &NVPTX::Int32RegsRegClass); 5046 case 'l': 5047 case 'N': 5048 return std::make_pair(0U, &NVPTX::Int64RegsRegClass); 5049 case 'f': 5050 return std::make_pair(0U, &NVPTX::Float32RegsRegClass); 5051 case 'd': 5052 return std::make_pair(0U, &NVPTX::Float64RegsRegClass); 5053 } 5054 } 5055 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 5056 } 5057 5058 //===----------------------------------------------------------------------===// 5059 // NVPTX DAG Combining 5060 //===----------------------------------------------------------------------===// 5061 5062 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, 5063 CodeGenOptLevel OptLevel) const { 5064 // Always honor command-line argument 5065 if (FMAContractLevelOpt.getNumOccurrences() > 0) 5066 return FMAContractLevelOpt > 0; 5067 5068 // Do not contract if we're not optimizing the code. 5069 if (OptLevel == CodeGenOptLevel::None) 5070 return false; 5071 5072 // Honor TargetOptions flags that explicitly say fusion is okay. 
  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
    return true;

  return allowUnsafeFPMath(MF);
}

bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
  // Honor TargetOptions flags that explicitly say unsafe math is okay.
  if (MF.getTarget().Options.UnsafeFPMath)
    return true;

  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
  const Function &F = MF.getFunction();
  return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
}

/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
static SDValue PerformADDCombineWithOperands(
    SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI,
    const NVPTXSubtarget &Subtarget, CodeGenOptLevel OptLevel) {
  SelectionDAG &DAG = DCI.DAG;
  // Skip the non-scalar (vector) case.
  EVT VT = N0.getValueType();
  if (VT.isVector())
    return SDValue();

  // fold (add (mul a, b), c) -> (mad a, b, c)
  if (N0.getOpcode() == ISD::MUL) {
    assert(VT.isInteger());
    // For integer:
    // Since integer multiply-add costs the same as integer multiply
    // but is more costly than integer add, do the fusion only when
    // the mul is only used by the add.
    if (OptLevel == CodeGenOptLevel::None || VT != MVT::i32 ||
        !N0.getNode()->hasOneUse())
      return SDValue();

    // Do the folding.
    return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, N0.getOperand(0),
                       N0.getOperand(1), N1);
  } else if (N0.getOpcode() == ISD::FMUL) {
    if (VT == MVT::f32 || VT == MVT::f64) {
      const auto *TLI = static_cast<const NVPTXTargetLowering *>(
          &DAG.getTargetLoweringInfo());
      if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
        return SDValue();

      // For floating point:
      // Do the fusion only when the mul has fewer than 5 uses and all of
      // them are adds.
      // The heuristic is that if a use is not an add, that use cannot be
      // fused into an fma, so the mul is still needed anyway. If there are
      // more than 4 uses, even if they are all adds, fusing them will
      // increase register pressure.
      int numUses = 0;
      int nonAddCount = 0;
      for (const SDNode *User : N0.getNode()->uses()) {
        numUses++;
        if (User->getOpcode() != ISD::FADD)
          ++nonAddCount;
      }
      if (numUses >= 5)
        return SDValue();
      if (nonAddCount) {
        int orderNo = N->getIROrder();
        int orderNo2 = N0.getNode()->getIROrder();
        // Simple heuristic for potential register pressure: the difference
        // between the IR orders approximates the distance between def and
        // use, and a longer distance is more likely to cause register
        // pressure.
        if (orderNo - orderNo2 < 500)
          return SDValue();

        // Now, check if at least one of the FMUL's operands is live beyond
        // node N, which guarantees that the FMA will not increase register
        // pressure at node N.
5154 bool opIsLive = false; 5155 const SDNode *left = N0.getOperand(0).getNode(); 5156 const SDNode *right = N0.getOperand(1).getNode(); 5157 5158 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right)) 5159 opIsLive = true; 5160 5161 if (!opIsLive) 5162 for (const SDNode *User : left->uses()) { 5163 int orderNo3 = User->getIROrder(); 5164 if (orderNo3 > orderNo) { 5165 opIsLive = true; 5166 break; 5167 } 5168 } 5169 5170 if (!opIsLive) 5171 for (const SDNode *User : right->uses()) { 5172 int orderNo3 = User->getIROrder(); 5173 if (orderNo3 > orderNo) { 5174 opIsLive = true; 5175 break; 5176 } 5177 } 5178 5179 if (!opIsLive) 5180 return SDValue(); 5181 } 5182 5183 return DAG.getNode(ISD::FMA, SDLoc(N), VT, 5184 N0.getOperand(0), N0.getOperand(1), N1); 5185 } 5186 } 5187 5188 return SDValue(); 5189 } 5190 5191 static SDValue PerformStoreRetvalCombine(SDNode *N) { 5192 // Operands from the 2nd to the last one are the values to be stored 5193 for (std::size_t I = 2, OpsCount = N->ops().size(); I != OpsCount; ++I) 5194 if (!N->getOperand(I).isUndef()) 5195 return SDValue(); 5196 5197 // Operand 0 is the previous value in the chain. Cannot return EntryToken 5198 // as the previous value will become unused and eliminated later. 5199 return N->getOperand(0); 5200 } 5201 5202 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 5203 /// 5204 static SDValue PerformADDCombine(SDNode *N, 5205 TargetLowering::DAGCombinerInfo &DCI, 5206 const NVPTXSubtarget &Subtarget, 5207 CodeGenOptLevel OptLevel) { 5208 SDValue N0 = N->getOperand(0); 5209 SDValue N1 = N->getOperand(1); 5210 5211 // First try with the default operand order. 5212 if (SDValue Result = 5213 PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel)) 5214 return Result; 5215 5216 // If that didn't work, try again with the operands commuted. 5217 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel); 5218 } 5219 5220 static SDValue PerformANDCombine(SDNode *N, 5221 TargetLowering::DAGCombinerInfo &DCI) { 5222 // The type legalizer turns a vector load of i8 values into a zextload to i16 5223 // registers, optionally ANY_EXTENDs it (if target type is integer), 5224 // and ANDs off the high 8 bits. Since we turn this load into a 5225 // target-specific DAG node, the DAG combiner fails to eliminate these AND 5226 // nodes. Do that here. 5227 SDValue Val = N->getOperand(0); 5228 SDValue Mask = N->getOperand(1); 5229 5230 if (isa<ConstantSDNode>(Val)) { 5231 std::swap(Val, Mask); 5232 } 5233 5234 SDValue AExt; 5235 5236 // Convert BFE-> truncate i16 -> and 255 5237 // To just BFE-> truncate i16, as the value already has all the bits in the 5238 // right places. 5239 if (Val.getOpcode() == ISD::TRUNCATE) { 5240 SDValue BFE = Val.getOperand(0); 5241 if (BFE.getOpcode() != NVPTXISD::BFE) 5242 return SDValue(); 5243 5244 ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0)); 5245 if (!BFEBits) 5246 return SDValue(); 5247 uint64_t BFEBitsVal = BFEBits->getZExtValue(); 5248 5249 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); 5250 if (!MaskCnst) { 5251 // Not an AND with a constant 5252 return SDValue(); 5253 } 5254 uint64_t MaskVal = MaskCnst->getZExtValue(); 5255 5256 if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1) 5257 return SDValue(); 5258 // If we get here, the AND is unnecessary. 
Just replace it with the trunc 5259 DCI.CombineTo(N, Val, false); 5260 } 5261 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and 5262 if (Val.getOpcode() == ISD::ANY_EXTEND) { 5263 AExt = Val; 5264 Val = Val->getOperand(0); 5265 } 5266 5267 if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { 5268 Val = Val->getOperand(0); 5269 } 5270 5271 if (Val->getOpcode() == NVPTXISD::LoadV2 || 5272 Val->getOpcode() == NVPTXISD::LoadV4) { 5273 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); 5274 if (!MaskCnst) { 5275 // Not an AND with a constant 5276 return SDValue(); 5277 } 5278 5279 uint64_t MaskVal = MaskCnst->getZExtValue(); 5280 if (MaskVal != 0xff) { 5281 // Not an AND that chops off top 8 bits 5282 return SDValue(); 5283 } 5284 5285 MemSDNode *Mem = dyn_cast<MemSDNode>(Val); 5286 if (!Mem) { 5287 // Not a MemSDNode?!? 5288 return SDValue(); 5289 } 5290 5291 EVT MemVT = Mem->getMemoryVT(); 5292 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { 5293 // We only handle the i8 case 5294 return SDValue(); 5295 } 5296 5297 unsigned ExtType = Val->getConstantOperandVal(Val->getNumOperands() - 1); 5298 if (ExtType == ISD::SEXTLOAD) { 5299 // If for some reason the load is a sextload, the and is needed to zero 5300 // out the high 8 bits 5301 return SDValue(); 5302 } 5303 5304 bool AddTo = false; 5305 if (AExt.getNode() != nullptr) { 5306 // Re-insert the ext as a zext. 5307 Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), 5308 AExt.getValueType(), Val); 5309 AddTo = true; 5310 } 5311 5312 // If we get here, the AND is unnecessary. Just replace it with the load 5313 DCI.CombineTo(N, Val, AddTo); 5314 } 5315 5316 return SDValue(); 5317 } 5318 5319 static SDValue PerformREMCombine(SDNode *N, 5320 TargetLowering::DAGCombinerInfo &DCI, 5321 CodeGenOptLevel OptLevel) { 5322 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM); 5323 5324 // Don't do anything at less than -O2. 5325 if (OptLevel < CodeGenOptLevel::Default) 5326 return SDValue(); 5327 5328 SelectionDAG &DAG = DCI.DAG; 5329 SDLoc DL(N); 5330 EVT VT = N->getValueType(0); 5331 bool IsSigned = N->getOpcode() == ISD::SREM; 5332 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV; 5333 5334 const SDValue &Num = N->getOperand(0); 5335 const SDValue &Den = N->getOperand(1); 5336 5337 for (const SDNode *U : Num->uses()) { 5338 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num && 5339 U->getOperand(1) == Den) { 5340 // Num % Den -> Num - (Num / Den) * Den 5341 return DAG.getNode(ISD::SUB, DL, VT, Num, 5342 DAG.getNode(ISD::MUL, DL, VT, 5343 DAG.getNode(DivOpc, DL, VT, Num, Den), 5344 Den)); 5345 } 5346 } 5347 return SDValue(); 5348 } 5349 5350 enum OperandSignedness { 5351 Signed = 0, 5352 Unsigned, 5353 Unknown 5354 }; 5355 5356 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand 5357 /// that can be demoted to \p OptSize bits without loss of information. The 5358 /// signedness of the operand, if determinable, is placed in \p S. 
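/// For example, with \p OptSize == 16, a 32-bit operand produced by
/// (sext i16 %x) or (zext i16 %x) is demotable, since its value is fully
/// determined by the low 16 bits; the signedness recorded in \p S decides
/// whether MUL_WIDE_SIGNED or MUL_WIDE_UNSIGNED is selected later.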
5359 static bool IsMulWideOperandDemotable(SDValue Op, 5360 unsigned OptSize, 5361 OperandSignedness &S) { 5362 S = Unknown; 5363 5364 if (Op.getOpcode() == ISD::SIGN_EXTEND || 5365 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { 5366 EVT OrigVT = Op.getOperand(0).getValueType(); 5367 if (OrigVT.getFixedSizeInBits() <= OptSize) { 5368 S = Signed; 5369 return true; 5370 } 5371 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { 5372 EVT OrigVT = Op.getOperand(0).getValueType(); 5373 if (OrigVT.getFixedSizeInBits() <= OptSize) { 5374 S = Unsigned; 5375 return true; 5376 } 5377 } 5378 5379 return false; 5380 } 5381 5382 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can 5383 /// be demoted to \p OptSize bits without loss of information. If the operands 5384 /// contain a constant, it should appear as the RHS operand. The signedness of 5385 /// the operands is placed in \p IsSigned. 5386 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, 5387 unsigned OptSize, 5388 bool &IsSigned) { 5389 OperandSignedness LHSSign; 5390 5391 // The LHS operand must be a demotable op 5392 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign)) 5393 return false; 5394 5395 // We should have been able to determine the signedness from the LHS 5396 if (LHSSign == Unknown) 5397 return false; 5398 5399 IsSigned = (LHSSign == Signed); 5400 5401 // The RHS can be a demotable op or a constant 5402 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { 5403 const APInt &Val = CI->getAPIntValue(); 5404 if (LHSSign == Unsigned) { 5405 return Val.isIntN(OptSize); 5406 } else { 5407 return Val.isSignedIntN(OptSize); 5408 } 5409 } else { 5410 OperandSignedness RHSSign; 5411 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) 5412 return false; 5413 5414 return LHSSign == RHSSign; 5415 } 5416 } 5417 5418 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply 5419 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform 5420 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift 5421 /// amount. 5422 static SDValue TryMULWIDECombine(SDNode *N, 5423 TargetLowering::DAGCombinerInfo &DCI) { 5424 EVT MulType = N->getValueType(0); 5425 if (MulType != MVT::i32 && MulType != MVT::i64) { 5426 return SDValue(); 5427 } 5428 5429 SDLoc DL(N); 5430 unsigned OptSize = MulType.getSizeInBits() >> 1; 5431 SDValue LHS = N->getOperand(0); 5432 SDValue RHS = N->getOperand(1); 5433 5434 // Canonicalize the multiply so the constant (if any) is on the right 5435 if (N->getOpcode() == ISD::MUL) { 5436 if (isa<ConstantSDNode>(LHS)) { 5437 std::swap(LHS, RHS); 5438 } 5439 } 5440 5441 // If we have a SHL, determine the actual multiply amount 5442 if (N->getOpcode() == ISD::SHL) { 5443 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS); 5444 if (!ShlRHS) { 5445 return SDValue(); 5446 } 5447 5448 APInt ShiftAmt = ShlRHS->getAPIntValue(); 5449 unsigned BitWidth = MulType.getSizeInBits(); 5450 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) { 5451 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; 5452 RHS = DCI.DAG.getConstant(MulVal, DL, MulType); 5453 } else { 5454 return SDValue(); 5455 } 5456 } 5457 5458 bool Signed; 5459 // Verify that our operands are demotable 5460 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) { 5461 return SDValue(); 5462 } 5463 5464 EVT DemotedVT; 5465 if (MulType == MVT::i32) { 5466 DemotedVT = MVT::i16; 5467 } else { 5468 DemotedVT = MVT::i32; 5469 } 5470 5471 // Truncate the operands to the correct size. 
Note that these are just for 5472 // type consistency and will (likely) be eliminated in later phases. 5473 SDValue TruncLHS = 5474 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS); 5475 SDValue TruncRHS = 5476 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS); 5477 5478 unsigned Opc; 5479 if (Signed) { 5480 Opc = NVPTXISD::MUL_WIDE_SIGNED; 5481 } else { 5482 Opc = NVPTXISD::MUL_WIDE_UNSIGNED; 5483 } 5484 5485 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS); 5486 } 5487 5488 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes. 5489 static SDValue PerformMULCombine(SDNode *N, 5490 TargetLowering::DAGCombinerInfo &DCI, 5491 CodeGenOptLevel OptLevel) { 5492 if (OptLevel > CodeGenOptLevel::None) { 5493 // Try mul.wide combining at OptLevel > 0 5494 if (SDValue Ret = TryMULWIDECombine(N, DCI)) 5495 return Ret; 5496 } 5497 5498 return SDValue(); 5499 } 5500 5501 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes. 5502 static SDValue PerformSHLCombine(SDNode *N, 5503 TargetLowering::DAGCombinerInfo &DCI, 5504 CodeGenOptLevel OptLevel) { 5505 if (OptLevel > CodeGenOptLevel::None) { 5506 // Try mul.wide combining at OptLevel > 0 5507 if (SDValue Ret = TryMULWIDECombine(N, DCI)) 5508 return Ret; 5509 } 5510 5511 return SDValue(); 5512 } 5513 5514 static SDValue PerformSETCCCombine(SDNode *N, 5515 TargetLowering::DAGCombinerInfo &DCI, 5516 unsigned int SmVersion) { 5517 EVT CCType = N->getValueType(0); 5518 SDValue A = N->getOperand(0); 5519 SDValue B = N->getOperand(1); 5520 5521 EVT AType = A.getValueType(); 5522 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16))) 5523 return SDValue(); 5524 5525 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90) 5526 return SDValue(); 5527 5528 SDLoc DL(N); 5529 // setp.f16x2 returns two scalar predicates, which we need to 5530 // convert back to v2i1. The returned result will be scalarized by 5531 // the legalizer, but the comparison will remain a single vector 5532 // instruction. 5533 SDValue CCNode = DCI.DAG.getNode( 5534 A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2 5535 : NVPTXISD::SETP_BF16X2, 5536 DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)}); 5537 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0), 5538 CCNode.getValue(1)); 5539 } 5540 5541 static SDValue PerformEXTRACTCombine(SDNode *N, 5542 TargetLowering::DAGCombinerInfo &DCI) { 5543 SDValue Vector = N->getOperand(0); 5544 SDLoc DL(N); 5545 EVT VectorVT = Vector.getValueType(); 5546 if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() && 5547 IsPTXVectorType(VectorVT.getSimpleVT())) 5548 return SDValue(); // Native vector loads already combine nicely w/ 5549 // extract_vector_elt, except for v4i8. 5550 // Don't mess with singletons or v2*16 types, we already handle them OK. 5551 if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) || 5552 VectorVT == MVT::v4i8) 5553 return SDValue(); 5554 5555 uint64_t VectorBits = VectorVT.getSizeInBits(); 5556 // We only handle the types we can extract in-register. 5557 if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64)) 5558 return SDValue(); 5559 5560 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1)); 5561 // Index == 0 is handled by generic DAG combiner. 
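  // For a non-zero constant index, lower the extract manually below: bitcast
  // the whole vector to a single integer of the same width, shift it right
  // by Index * EltBits so the requested element lands at bit 0, and truncate
  // to the element-sized integer type.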
  if (!Index || Index->getZExtValue() == 0)
    return SDValue();

  MVT IVT = MVT::getIntegerVT(VectorBits);
  EVT EltVT = VectorVT.getVectorElementType();
  EVT EltIVT = EltVT.changeTypeToInteger();
  uint64_t EltBits = EltVT.getScalarSizeInBits();

  SDValue Result = DCI.DAG.getNode(
      ISD::TRUNCATE, DL, EltIVT,
      DCI.DAG.getNode(
          ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
          DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));

  // If the element has a non-integer type, bitcast it back to the expected
  // type.
  if (EltVT != EltIVT)
    Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
  // Past the legalizer, we may need to extend i8 -> i16 to match the register
  // type.
  if (EltVT != N->getValueType(0))
    Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);

  return Result;
}

static SDValue PerformVSELECTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  SDValue VA = N->getOperand(1);
  EVT VectorVT = VA.getValueType();
  if (VectorVT != MVT::v4i8)
    return SDValue();

  // We need to split the vselect into individual per-element operations.
  // Because we use BFE/BFI instructions for byte extraction/insertion, we
  // end up with 32-bit values anyway, so we may as well do the comparison
  // as i32 to avoid conversions to/from i16 normally used for i8 values.
  SmallVector<SDValue, 4> E;
  SDLoc DL(N);
  SDValue VCond = N->getOperand(0);
  SDValue VB = N->getOperand(2);
  for (int I = 0; I < 4; ++I) {
    SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
                                DCI.DAG.getConstant(I, DL, MVT::i32));
    SDValue EA = DCI.DAG.getAnyExtOrTrunc(
        DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
                        DCI.DAG.getConstant(I, DL, MVT::i32)),
        DL, MVT::i32);
    SDValue EB = DCI.DAG.getAnyExtOrTrunc(
        DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
                        DCI.DAG.getConstant(I, DL, MVT::i32)),
        DL, MVT::i32);
    E.push_back(DCI.DAG.getAnyExtOrTrunc(
        DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
  }
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
}

static SDValue PerformLOADCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  LoadSDNode *LD = cast<LoadSDNode>(N);

  // Lower a v16i8 load into a LoadV4 operation with i32 results instead of
  // letting ReplaceLoadVector split it into smaller loads during legalization.
  // This is done at dag-combine1 time, so that vector operations with i8
  // elements can be optimised away instead of being needlessly split during
  // legalization, which involves storing to the stack and loading it back.
  EVT VT = N->getValueType(0);
  if (VT != MVT::v16i8)
    return SDValue();

  SDLoc DL(N);

  // Create a v4i32 vector load operation, effectively <4 x v4i8>.
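  // Each i32 result of the LoadV4 packs four of the original i8 elements;
  // the four values are reassembled into a v4i32 build_vector and bitcast
  // back to v16i8 before being merged with the new chain below.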
  unsigned Opc = NVPTXISD::LoadV4;
  EVT NewVT = MVT::v4i32;
  EVT EltVT = NewVT.getVectorElementType();
  unsigned NumElts = NewVT.getVectorNumElements();
  EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
  SDVTList RetVTList = DAG.getVTList(RetVTs);
  SmallVector<SDValue, 8> Ops(N->ops());
  Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
  SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT,
                                            LD->getMemOperand());
  SDValue NewChain = NewLoad.getValue(NumElts);

  // Create a vector of the same type returned by the original load.
  SmallVector<SDValue, 4> Elts;
  for (unsigned i = 0; i < NumElts; i++)
    Elts.push_back(NewLoad.getValue(i));
  return DCI.DAG.getMergeValues(
      {DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
       NewChain},
      DL);
}

SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
  switch (N->getOpcode()) {
    default: break;
    case ISD::ADD:
    case ISD::FADD:
      return PerformADDCombine(N, DCI, STI, OptLevel);
    case ISD::MUL:
      return PerformMULCombine(N, DCI, OptLevel);
    case ISD::SHL:
      return PerformSHLCombine(N, DCI, OptLevel);
    case ISD::AND:
      return PerformANDCombine(N, DCI);
    case ISD::UREM:
    case ISD::SREM:
      return PerformREMCombine(N, DCI, OptLevel);
    case ISD::SETCC:
      return PerformSETCCCombine(N, DCI, STI.getSmVersion());
    case ISD::LOAD:
      return PerformLOADCombine(N, DCI);
    case NVPTXISD::StoreRetval:
    case NVPTXISD::StoreRetvalV2:
    case NVPTXISD::StoreRetvalV4:
      return PerformStoreRetvalCombine(N);
    case ISD::EXTRACT_VECTOR_ELT:
      return PerformEXTRACTCombine(N, DCI);
    case ISD::VSELECT:
      return PerformVSELECTCombine(N, DCI);
  }
  return SDValue();
}

/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                              SmallVectorImpl<SDValue> &Results) {
  EVT ResVT = N->getValueType(0);
  SDLoc DL(N);

  assert(ResVT.isVector() && "Vector load must have vector type");

  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
  // legal. We can (and should) split that into 2 loads of <2 x double> here
  // but I'm leaving that as a TODO for now.
  assert(ResVT.isSimple() && "Can only handle simple types");
  switch (ResVT.getSimpleVT().SimpleTy) {
  default:
    return;
  case MVT::v2i8:
  case MVT::v2i16:
  case MVT::v2i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v2f32:
  case MVT::v2f64:
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v4i32:
  case MVT::v4f16:
  case MVT::v4f32:
  case MVT::v8f16:  // <4 x f16x2>
  case MVT::v8bf16: // <4 x bf16x2>
  case MVT::v8i16:  // <4 x i16x2>
    // This is a "native" vector type
    break;
  }

  LoadSDNode *LD = cast<LoadSDNode>(N);

  Align Alignment = LD->getAlign();
  auto &TD = DAG.getDataLayout();
  Align PrefAlign =
      TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
  if (Alignment < PrefAlign) {
    // This load is not sufficiently aligned, so bail out and let this vector
    // load be scalarized. Note that we may still be able to emit smaller
    // vector loads.
    // For example, if we are loading a <4 x float> with an
    // alignment of 8, this check will fail but the legalizer will try again
    // with 2 x <2 x float>, which will succeed with an alignment of 8.
    return;
  }

  EVT EltVT = ResVT.getVectorElementType();
  unsigned NumElts = ResVT.getVectorNumElements();

  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
  // loaded type to i16 and propagate the "real" type as the memory type.
  bool NeedTrunc = false;
  if (EltVT.getSizeInBits() < 16) {
    EltVT = MVT::i16;
    NeedTrunc = true;
  }

  unsigned Opcode = 0;
  SDVTList LdResVTs;
  bool Load16x2 = false;

  switch (NumElts) {
  default:
    return;
  case 2:
    Opcode = NVPTXISD::LoadV2;
    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
    break;
  case 4: {
    Opcode = NVPTXISD::LoadV4;
    EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  case 8: {
    // v8f16 is a special case. PTX doesn't have an ld.v8.f16
    // instruction. Instead, we split the vector into v2f16 chunks and
    // load them with ld.v4.b32.
    assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
    Load16x2 = true;
    Opcode = NVPTXISD::LoadV4;
    EVT VVT;
    switch (EltVT.getSimpleVT().SimpleTy) {
    case MVT::f16:
      VVT = MVT::v2f16;
      break;
    case MVT::bf16:
      VVT = MVT::v2bf16;
      break;
    case MVT::i16:
      VVT = MVT::v2i16;
      break;
    default:
      llvm_unreachable("Unsupported v8 vector type.");
    }
    EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  }

  // Copy regular operands
  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());

  // The select routine does not have access to the LoadSDNode instance, so
  // pass along the extension information
  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));

  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                          LD->getMemoryVT(),
                                          LD->getMemOperand());

  SmallVector<SDValue, 8> ScalarRes;
  if (Load16x2) {
    // Split v2f16 subvectors back into individual elements.
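    // The LoadV4 issued above yields NumElts/2 packed two-element subvectors
    // (e.g. four v2f16 values for an original v8f16 load), so halve the count
    // and pull both lanes out of each subvector.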
    NumElts /= 2;
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue SubVector = NewLD.getValue(i);
      SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(0, DL));
      SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(1, DL));
      ScalarRes.push_back(E0);
      ScalarRes.push_back(E1);
    }
  } else {
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue Res = NewLD.getValue(i);
      if (NeedTrunc)
        Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
      ScalarRes.push_back(Res);
    }
  }

  SDValue LoadChain = NewLD.getValue(NumElts);

  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);

  Results.push_back(BuildVec);
  Results.push_back(LoadChain);
}

static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &Results) {
  SDValue Chain = N->getOperand(0);
  SDValue Intrin = N->getOperand(1);
  SDLoc DL(N);

  // Get the intrinsic ID
  unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
  switch (IntrinNo) {
  default:
    return;
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    EVT ResVT = N->getValueType(0);

    if (ResVT.isVector()) {
      // Vector LDG/LDU

      unsigned NumElts = ResVT.getVectorNumElements();
      EVT EltVT = ResVT.getVectorElementType();

      // Since LDU/LDG are target nodes, we cannot rely on DAG type
      // legalization.
      // Therefore, we must ensure the type is legal. For i1 and i8, we set the
      // loaded type to i16 and propagate the "real" type as the memory type.
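      // E.g. a <2 x i8> ldg result is materialized here as two i16 values and
      // truncated back to i8 when the result vector is rebuilt further down.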
      bool NeedTrunc = false;
      if (EltVT.getSizeInBits() < 16) {
        EltVT = MVT::i16;
        NeedTrunc = true;
      }

      unsigned Opcode = 0;
      SDVTList LdResVTs;

      switch (NumElts) {
      default:
        return;
      case 2:
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV2;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV2;
          break;
        }
        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
        break;
      case 4: {
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV4;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV4;
          break;
        }
        EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
        LdResVTs = DAG.getVTList(ListVTs);
        break;
      }
      }

      SmallVector<SDValue, 8> OtherOps;

      // Copy regular operands.
      OtherOps.push_back(Chain); // Chain
      // Skip operand 1 (intrinsic ID); copy the others.
      OtherOps.append(N->op_begin() + 2, N->op_end());

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                              MemSD->getMemoryVT(),
                                              MemSD->getMemOperand());

      SmallVector<SDValue, 4> ScalarRes;

      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue Res = NewLD.getValue(i);
        if (NeedTrunc)
          Res =
              DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
        ScalarRes.push_back(Res);
      }

      SDValue LoadChain = NewLD.getValue(NumElts);

      SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);

      Results.push_back(BuildVec);
      Results.push_back(LoadChain);
    } else {
      // i8 LDG/LDU
      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
             "Custom handling of non-i8 ldu/ldg?");

      // Just copy all operands as-is
      SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());

      // Force output to i16
      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      // We make sure the memory type is i8, which will be used during isel
      // to select the proper instruction.
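      // Note the opcode stays as the generic INTRINSIC_W_CHAIN; only the
      // in-register result type is widened to i16, and the final value is
      // truncated back to i8 below.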
      SDValue NewLD =
          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
                                  MVT::i8, MemSD->getMemOperand());

      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                    NewLD.getValue(0)));
      Results.push_back(NewLD.getValue(1));
    }
  }
  }
}

void NVPTXTargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    report_fatal_error("Unhandled custom legalization");
  case ISD::LOAD:
    ReplaceLoadVector(N, DAG, Results);
    return;
  case ISD::INTRINSIC_W_CHAIN:
    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
    return;
  }
}

NVPTXTargetLowering::AtomicExpansionKind
NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  Type *Ty = AI->getValOperand()->getType();

  if (AI->isFloatingPointOperation()) {
    if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
      if (Ty->isFloatTy())
        return AtomicExpansionKind::None;
      if (Ty->isDoubleTy() && STI.hasAtomAddF64())
        return AtomicExpansionKind::None;
    }
    return AtomicExpansionKind::CmpXChg;
  }

  assert(Ty->isIntegerTy() && "Ty should be integer at this point");
  auto ITy = cast<llvm::IntegerType>(Ty);

  switch (AI->getOperation()) {
  default:
    return AtomicExpansionKind::CmpXChg;
  case AtomicRMWInst::BinOp::And:
  case AtomicRMWInst::BinOp::Or:
  case AtomicRMWInst::BinOp::Xor:
  case AtomicRMWInst::BinOp::Xchg:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomBitwise64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  case AtomicRMWInst::BinOp::Add:
  case AtomicRMWInst::BinOp::Sub:
  case AtomicRMWInst::BinOp::Max:
  case AtomicRMWInst::BinOp::Min:
  case AtomicRMWInst::BinOp::UMax:
  case AtomicRMWInst::BinOp::UMin:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomMinMax64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  }

  return AtomicExpansionKind::CmpXChg;
}

// Pin NVPTXTargetObjectFile's vtables to this file.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;

MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
  return getDataSection();
}