//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelLowering.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/FPEnv.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iterator>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#define DEBUG_TYPE "nvptx-lower"

using namespace llvm;

static std::atomic<unsigned> GlobalUniqueCallSite;

static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"),
    cl::init(false));

static cl::opt<unsigned> FMAContractLevelOpt(
    "nvptx-fma-level", cl::Hidden,
    cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
             " 1: do it, 2: do it aggressively)"),
    cl::init(2));

static cl::opt<int> UsePrecDivF32(
    "nvptx-prec-divf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
             " IEEE Compliant F32 div.rnd if available."),
    cl::init(2));

static cl::opt<bool> UsePrecSqrtF32(
    "nvptx-prec-sqrtf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
    cl::init(true));

int NVPTXTargetLowering::getDivF32Level() const {
  if (UsePrecDivF32.getNumOccurrences() > 0) {
    // If nvptx-prec-divf32=N is used on the command-line, always honor it.
    return UsePrecDivF32;
  } else {
    // Otherwise, use div.approx if fast math is enabled.
    if (getTargetMachine().Options.UnsafeFPMath)
      return 0;
    else
      return 2;
  }
}

bool
NVPTXTargetLowering::usePrecSqrtF32() const {
  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it.
    return UsePrecSqrtF32;
  } else {
    // Otherwise, use sqrt.approx if fast math is enabled.
    return !getTargetMachine().Options.UnsafeFPMath;
  }
}

bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
  return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
         DenormalMode::PreserveSign;
}

static bool IsPTXVectorType(MVT VT) {
  switch (VT.SimpleTy) {
  default:
    return false;
  case MVT::v2i1:
  case MVT::v4i1:
  case MVT::v2i8:
  case MVT::v4i8:
  case MVT::v2i16:
  case MVT::v4i16:
  case MVT::v2i32:
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v4f16:
  case MVT::v8f16: // <4 x f16x2>
  case MVT::v2bf16:
  case MVT::v4bf16:
  case MVT::v8bf16: // <4 x bf16x2>
  case MVT::v2f32:
  case MVT::v4f32:
  case MVT::v2f64:
    return true;
  }
}

/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return
/// the same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                               uint64_t StartingOffset = 0) {
  SmallVector<EVT, 16> TempVTs;
  SmallVector<uint64_t, 16> TempOffsets;

  // Special case for i128 - decompose to (i64, i64).
  if (Ty->isIntegerTy(128)) {
    ValueVTs.push_back(EVT(MVT::i64));
    ValueVTs.push_back(EVT(MVT::i64));

    if (Offsets) {
      Offsets->push_back(StartingOffset + 0);
      Offsets->push_back(StartingOffset + 8);
    }

    return;
  }

  // Given a struct type, recursively traverse the elements with the custom
  // ComputePTXValueVTs.
  if (StructType *STy = dyn_cast<StructType>(Ty)) {
    auto const *SL = DL.getStructLayout(STy);
    auto ElementNum = 0;
    for (auto *EI : STy->elements()) {
      ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
                         StartingOffset + SL->getElementOffset(ElementNum));
      ++ElementNum;
    }
    return;
  }

  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
    EVT VT = TempVTs[i];
    uint64_t Off = TempOffsets[i];
    // Split vectors into individual elements, except for v2f16, which
    // we will pass as a single scalar.
    if (VT.isVector()) {
      unsigned NumElts = VT.getVectorNumElements();
      EVT EltVT = VT.getVectorElementType();
      // Vectors with an even number of f16 elements will be passed to
      // us as an array of v2f16/v2bf16 elements. We must match this so we
      // stay in sync with Ins/Outs.
      if ((EltVT == MVT::f16 || EltVT == MVT::bf16) && NumElts % 2 == 0) {
        EltVT = EltVT == MVT::f16 ? MVT::v2f16 : MVT::v2bf16;
        NumElts /= 2;
      }
      for (unsigned j = 0; j != NumElts; ++j) {
        ValueVTs.push_back(EltVT);
        if (Offsets)
          Offsets->push_back(Off + j * EltVT.getStoreSize());
      }
    } else {
      ValueVTs.push_back(VT);
      if (Offsets)
        Offsets->push_back(Off);
    }
  }
}

/// PromoteScalarIntegerPTX
/// Used to make sure the arguments/returns are suitable for passing
/// and promote them to a larger size if they're not.
///
/// The promoted type is placed in \p PromotedVT if the function returns true.
static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
  if (VT.isScalarInteger()) {
    switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
    default:
      llvm_unreachable(
          "Promotion is not suitable for scalars of size larger than 64-bits");
    case 1:
      *PromotedVT = MVT::i1;
      break;
    case 2:
    case 4:
    case 8:
      *PromotedVT = MVT::i8;
      break;
    case 16:
      *PromotedVT = MVT::i16;
      break;
    case 32:
      *PromotedVT = MVT::i32;
      break;
    case 64:
      *PromotedVT = MVT::i64;
      break;
    }
    return EVT(*PromotedVT) != VT;
  }
  return false;
}

// Check whether we can merge loads/stores of some of the pieces of a
// flattened function parameter or return value into a single vector
// load/store.
//
// The flattened parameter is represented as a list of EVTs and
// offsets, and the whole structure is aligned to ParamAlignment. This
// function determines whether we can load/store pieces of the
// parameter starting at index Idx using a single vectorized op of
// size AccessSize. If so, it returns the number of param pieces
// covered by the vector op. Otherwise, it returns 1.
static unsigned CanMergeParamLoadStoresStartingAt(
    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
    const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {

  // Can't vectorize if param alignment is not sufficient.
  if (ParamAlignment < AccessSize)
    return 1;
  // Can't vectorize if offset is not aligned.
  if (Offsets[Idx] & (AccessSize - 1))
    return 1;

  EVT EltVT = ValueVTs[Idx];
  unsigned EltSize = EltVT.getStoreSize();

  // Element is too large to vectorize.
  if (EltSize >= AccessSize)
    return 1;

  unsigned NumElts = AccessSize / EltSize;
  // Can't vectorize if AccessSize is not a multiple of EltSize.
  if (AccessSize != EltSize * NumElts)
    return 1;

  // We don't have enough elements to vectorize.
  if (Idx + NumElts > ValueVTs.size())
    return 1;

  // PTX ISA can only deal with 2- and 4-element vector ops.
  if (NumElts != 4 && NumElts != 2)
    return 1;

  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
    // Types do not match.
    if (ValueVTs[j] != EltVT)
      return 1;

    // Elements are not contiguous.
    if (Offsets[j] - Offsets[j - 1] != EltSize)
      return 1;
  }
  // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
  return NumElts;
}

// Flags for tracking per-element vectorization state of loads/stores
// of a flattened function parameter or return value.
enum ParamVectorizationFlags {
  PVF_INNER = 0x0, // Middle elements of a vector.
  PVF_FIRST = 0x1, // First element of the vector.
  PVF_LAST = 0x2,  // Last element of the vector.
  // Scalar is effectively a 1-element vector.
  PVF_SCALAR = PVF_FIRST | PVF_LAST
};

// Computes whether and how we can vectorize the loads/stores of a
// flattened function parameter or return value.
//
// The flattened parameter is represented as the list of ValueVTs and
// Offsets, and is aligned to ParamAlignment bytes. We return a vector
// of the same size as ValueVTs indicating how each piece should be
// loaded/stored (i.e. as a scalar, or as part of a vector
// load/store).
static SmallVector<ParamVectorizationFlags, 16>
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
                     const SmallVectorImpl<uint64_t> &Offsets,
                     Align ParamAlignment, bool IsVAArg = false) {
  // Set vector size to match ValueVTs and mark all elements as
  // scalars by default.
  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);

  if (IsVAArg)
    return VectorInfo;

  // Check what we can vectorize using 128/64/32-bit accesses.
  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
    // Skip elements we've already processed.
    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
    for (unsigned AccessSize : {16, 8, 4, 2}) {
      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
      // Mark vectorized elements.
      switch (NumElts) {
      default:
        llvm_unreachable("Unexpected return value");
      case 1:
        // Can't vectorize using this size, try next smaller size.
        continue;
      case 2:
        assert(I + 1 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_LAST;
        I += 1;
        break;
      case 4:
        assert(I + 3 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_INNER;
        VectorInfo[I + 2] = PVF_INNER;
        VectorInfo[I + 3] = PVF_LAST;
        I += 3;
        break;
      }
      // Break out of the inner loop because we've already succeeded
      // using the largest possible AccessSize.
      break;
    }
  }
  return VectorInfo;
}

// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                         const NVPTXSubtarget &STI)
    : TargetLowering(TM), nvTM(&TM), STI(STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  // condition branches.
  setJumpIsExpensive(true);

  // Wide divides are _very_ slow. Try to reduce the width of the divide if
  // possible.
  addBypassSlowDiv(64, 32);

  // By default, use the Source scheduling.
  if (sched4reg)
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Source);

  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                    LegalizeAction NoF16Action) {
    setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
  };

  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
  addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
  addRegisterClass(MVT::bf16, &NVPTX::Float16RegsRegClass);
  addRegisterClass(MVT::v2bf16, &NVPTX::Float16x2RegsRegClass);

  // Conversion to/from FP16/FP16x2 is always legal.
  setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
  setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);

  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);

  // Operations not directly supported by NVPTX.
  for (MVT VT : {MVT::f16, MVT::v2f16, MVT::f32, MVT::f64, MVT::i1, MVT::i8,
                 MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::BR_CC, VT, Expand);
  }

  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
  // For others we will expand to a SHL/SRA pair.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

  // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
  // that don't have h/w rotation we lower them to multi-instruction assembly.
  // See ROT*_sw in NVPTXIntrInfo.td
  setOperationAction(ISD::ROTL, MVT::i64, Legal);
  setOperationAction(ISD::ROTR, MVT::i64, Legal);
  setOperationAction(ISD::ROTL, MVT::i32, Legal);
  setOperationAction(ISD::ROTR, MVT::i32, Legal);

  setOperationAction(ISD::ROTL, MVT::i16, Expand);
  setOperationAction(ISD::ROTR, MVT::i16, Expand);
  setOperationAction(ISD::ROTL, MVT::i8, Expand);
  setOperationAction(ISD::ROTR, MVT::i8, Expand);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  setOperationAction(ISD::BSWAP, MVT::i64, Expand);

  // Indirect branch is not supported.
  // This also disables Jump Table creation.
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  // We want to legalize constant-related memmove and memcpy
  // intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  // Turn FP extload into load/fpextend.
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  // Turn FP truncstore into trunc + store.
  // FIXME: vector types should also be expanded
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PTX does not support load / store predicate registers.
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setTruncStoreAction(VT, MVT::i1, Expand);
  }

  // This is legal in NVPTX.
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
  setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);

  // TRAP can be lowered to PTX trap.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // Register custom handling for vector loads/stores.
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    if (IsPTXVectorType(VT)) {
      setOperationAction(ISD::LOAD, VT, Custom);
      setOperationAction(ISD::STORE, VT, Custom);
      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
    }
  }

  // Support varargs.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Custom handling for i8 intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  for (const auto &Ty : {MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::ABS, Ty, Legal);
    setOperationAction(ISD::SMIN, Ty, Legal);
    setOperationAction(ISD::SMAX, Ty, Legal);
    setOperationAction(ISD::UMIN, Ty, Legal);
    setOperationAction(ISD::UMAX, Ty, Legal);

    setOperationAction(ISD::CTPOP, Ty, Legal);
    setOperationAction(ISD::CTLZ, Ty, Legal);
  }

  setOperationAction(ISD::ADDC, MVT::i32, Legal);
  setOperationAction(ISD::ADDE, MVT::i32, Legal);
  setOperationAction(ISD::SUBC, MVT::i32, Legal);
  setOperationAction(ISD::SUBE, MVT::i32, Legal);
  if (STI.getPTXVersion() >= 43) {
    setOperationAction(ISD::ADDC, MVT::i64, Legal);
    setOperationAction(ISD::ADDE, MVT::i64, Legal);
    setOperationAction(ISD::SUBC, MVT::i64, Legal);
    setOperationAction(ISD::SUBE, MVT::i64, Legal);
  }

  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i64, Expand);

  // PTX does not directly support SELP of i1, so promote to i32 first.
  setOperationAction(ISD::SELECT, MVT::i1, Custom);

  // PTX cannot multiply two i64s in a single instruction.
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);

  // We have some custom DAG combine patterns for these nodes.
  setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::FADD, ISD::MUL, ISD::SHL,
                       ISD::SREM, ISD::UREM});

  // setcc for f16x2 needs special handling to prevent the legalizer's
  // attempt to scalarize it due to v2i1 not being legal.
  if (STI.allowFP16Math())
    setTargetDAGCombine(ISD::SETCC);

  // Promote fp16 arithmetic if fp16 hardware isn't available or the
  // user passed --nvptx-no-fp16-math. The flag is useful because,
  // although sm_53+ GPUs have some sort of FP16 support in
  // hardware, only sm_53 and sm_60 have a full implementation. Others
  // only have a token amount of hardware and are likely to run faster
  // by using fp32 units instead.
  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
    setFP16OperationAction(Op, MVT::f16, Legal, Promote);
    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
  }

  // f16/f16x2 neg was introduced in PTX 60, SM_53.
  const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
                                        STI.getPTXVersion() >= 60 &&
                                        STI.allowFP16Math();
  for (const auto &VT : {MVT::f16, MVT::v2f16})
    setOperationAction(ISD::FNEG, VT,
                       IsFP16FP16x2NegAvailable ? Legal : Expand);

  // (would be) Library functions.

  // These map to conversion instructions for scalar FP types.
  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
                         ISD::FROUNDEVEN, ISD::FTRUNC}) {
    setOperationAction(Op, MVT::f16, Legal);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
  }

  setOperationAction(ISD::FROUND, MVT::f16, Promote);
  setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
  setOperationAction(ISD::FROUND, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, MVT::f64, Custom);

  // 'Expand' implements FCOPYSIGN without calling an external library.
  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);

  // These map to corresponding instructions for f32/f64. f16 must be
  // promoted to f32. v2f16 is expanded to f16, which is then promoted
  // to f32.
  for (const auto &Op :
       {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FABS}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
  }
  // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
  auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
    bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
    return IsAtLeastSm80 ? Legal : NotSm80Action;
  };
  for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
    setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
  }
  for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
    setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
    setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
    setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
  }

  // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
  // No FPOW or FREM in PTX.
642 643 // Now deduce the information based on the above mentioned 644 // actions 645 computeRegisterProperties(STI.getRegisterInfo()); 646 647 setMinCmpXchgSizeInBits(32); 648 } 649 650 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { 651 switch ((NVPTXISD::NodeType)Opcode) { 652 case NVPTXISD::FIRST_NUMBER: 653 break; 654 case NVPTXISD::CALL: 655 return "NVPTXISD::CALL"; 656 case NVPTXISD::RET_FLAG: 657 return "NVPTXISD::RET_FLAG"; 658 case NVPTXISD::LOAD_PARAM: 659 return "NVPTXISD::LOAD_PARAM"; 660 case NVPTXISD::Wrapper: 661 return "NVPTXISD::Wrapper"; 662 case NVPTXISD::DeclareParam: 663 return "NVPTXISD::DeclareParam"; 664 case NVPTXISD::DeclareScalarParam: 665 return "NVPTXISD::DeclareScalarParam"; 666 case NVPTXISD::DeclareRet: 667 return "NVPTXISD::DeclareRet"; 668 case NVPTXISD::DeclareScalarRet: 669 return "NVPTXISD::DeclareScalarRet"; 670 case NVPTXISD::DeclareRetParam: 671 return "NVPTXISD::DeclareRetParam"; 672 case NVPTXISD::PrintCall: 673 return "NVPTXISD::PrintCall"; 674 case NVPTXISD::PrintConvergentCall: 675 return "NVPTXISD::PrintConvergentCall"; 676 case NVPTXISD::PrintCallUni: 677 return "NVPTXISD::PrintCallUni"; 678 case NVPTXISD::PrintConvergentCallUni: 679 return "NVPTXISD::PrintConvergentCallUni"; 680 case NVPTXISD::LoadParam: 681 return "NVPTXISD::LoadParam"; 682 case NVPTXISD::LoadParamV2: 683 return "NVPTXISD::LoadParamV2"; 684 case NVPTXISD::LoadParamV4: 685 return "NVPTXISD::LoadParamV4"; 686 case NVPTXISD::StoreParam: 687 return "NVPTXISD::StoreParam"; 688 case NVPTXISD::StoreParamV2: 689 return "NVPTXISD::StoreParamV2"; 690 case NVPTXISD::StoreParamV4: 691 return "NVPTXISD::StoreParamV4"; 692 case NVPTXISD::StoreParamS32: 693 return "NVPTXISD::StoreParamS32"; 694 case NVPTXISD::StoreParamU32: 695 return "NVPTXISD::StoreParamU32"; 696 case NVPTXISD::CallArgBegin: 697 return "NVPTXISD::CallArgBegin"; 698 case NVPTXISD::CallArg: 699 return "NVPTXISD::CallArg"; 700 case NVPTXISD::LastCallArg: 701 return "NVPTXISD::LastCallArg"; 702 case NVPTXISD::CallArgEnd: 703 return "NVPTXISD::CallArgEnd"; 704 case NVPTXISD::CallVoid: 705 return "NVPTXISD::CallVoid"; 706 case NVPTXISD::CallVal: 707 return "NVPTXISD::CallVal"; 708 case NVPTXISD::CallSymbol: 709 return "NVPTXISD::CallSymbol"; 710 case NVPTXISD::Prototype: 711 return "NVPTXISD::Prototype"; 712 case NVPTXISD::MoveParam: 713 return "NVPTXISD::MoveParam"; 714 case NVPTXISD::StoreRetval: 715 return "NVPTXISD::StoreRetval"; 716 case NVPTXISD::StoreRetvalV2: 717 return "NVPTXISD::StoreRetvalV2"; 718 case NVPTXISD::StoreRetvalV4: 719 return "NVPTXISD::StoreRetvalV4"; 720 case NVPTXISD::PseudoUseParam: 721 return "NVPTXISD::PseudoUseParam"; 722 case NVPTXISD::RETURN: 723 return "NVPTXISD::RETURN"; 724 case NVPTXISD::CallSeqBegin: 725 return "NVPTXISD::CallSeqBegin"; 726 case NVPTXISD::CallSeqEnd: 727 return "NVPTXISD::CallSeqEnd"; 728 case NVPTXISD::CallPrototype: 729 return "NVPTXISD::CallPrototype"; 730 case NVPTXISD::ProxyReg: 731 return "NVPTXISD::ProxyReg"; 732 case NVPTXISD::LoadV2: 733 return "NVPTXISD::LoadV2"; 734 case NVPTXISD::LoadV4: 735 return "NVPTXISD::LoadV4"; 736 case NVPTXISD::LDGV2: 737 return "NVPTXISD::LDGV2"; 738 case NVPTXISD::LDGV4: 739 return "NVPTXISD::LDGV4"; 740 case NVPTXISD::LDUV2: 741 return "NVPTXISD::LDUV2"; 742 case NVPTXISD::LDUV4: 743 return "NVPTXISD::LDUV4"; 744 case NVPTXISD::StoreV2: 745 return "NVPTXISD::StoreV2"; 746 case NVPTXISD::StoreV4: 747 return "NVPTXISD::StoreV4"; 748 case NVPTXISD::FUN_SHFL_CLAMP: 749 return "NVPTXISD::FUN_SHFL_CLAMP"; 750 
case NVPTXISD::FUN_SHFR_CLAMP: 751 return "NVPTXISD::FUN_SHFR_CLAMP"; 752 case NVPTXISD::IMAD: 753 return "NVPTXISD::IMAD"; 754 case NVPTXISD::SETP_F16X2: 755 return "NVPTXISD::SETP_F16X2"; 756 case NVPTXISD::Dummy: 757 return "NVPTXISD::Dummy"; 758 case NVPTXISD::MUL_WIDE_SIGNED: 759 return "NVPTXISD::MUL_WIDE_SIGNED"; 760 case NVPTXISD::MUL_WIDE_UNSIGNED: 761 return "NVPTXISD::MUL_WIDE_UNSIGNED"; 762 case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32"; 763 case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat"; 764 case NVPTXISD::Tex1DFloatFloatLevel: 765 return "NVPTXISD::Tex1DFloatFloatLevel"; 766 case NVPTXISD::Tex1DFloatFloatGrad: 767 return "NVPTXISD::Tex1DFloatFloatGrad"; 768 case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32"; 769 case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float"; 770 case NVPTXISD::Tex1DS32FloatLevel: 771 return "NVPTXISD::Tex1DS32FloatLevel"; 772 case NVPTXISD::Tex1DS32FloatGrad: 773 return "NVPTXISD::Tex1DS32FloatGrad"; 774 case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32"; 775 case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float"; 776 case NVPTXISD::Tex1DU32FloatLevel: 777 return "NVPTXISD::Tex1DU32FloatLevel"; 778 case NVPTXISD::Tex1DU32FloatGrad: 779 return "NVPTXISD::Tex1DU32FloatGrad"; 780 case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32"; 781 case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat"; 782 case NVPTXISD::Tex1DArrayFloatFloatLevel: 783 return "NVPTXISD::Tex1DArrayFloatFloatLevel"; 784 case NVPTXISD::Tex1DArrayFloatFloatGrad: 785 return "NVPTXISD::Tex1DArrayFloatFloatGrad"; 786 case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32"; 787 case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float"; 788 case NVPTXISD::Tex1DArrayS32FloatLevel: 789 return "NVPTXISD::Tex1DArrayS32FloatLevel"; 790 case NVPTXISD::Tex1DArrayS32FloatGrad: 791 return "NVPTXISD::Tex1DArrayS32FloatGrad"; 792 case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32"; 793 case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float"; 794 case NVPTXISD::Tex1DArrayU32FloatLevel: 795 return "NVPTXISD::Tex1DArrayU32FloatLevel"; 796 case NVPTXISD::Tex1DArrayU32FloatGrad: 797 return "NVPTXISD::Tex1DArrayU32FloatGrad"; 798 case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32"; 799 case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat"; 800 case NVPTXISD::Tex2DFloatFloatLevel: 801 return "NVPTXISD::Tex2DFloatFloatLevel"; 802 case NVPTXISD::Tex2DFloatFloatGrad: 803 return "NVPTXISD::Tex2DFloatFloatGrad"; 804 case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32"; 805 case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float"; 806 case NVPTXISD::Tex2DS32FloatLevel: 807 return "NVPTXISD::Tex2DS32FloatLevel"; 808 case NVPTXISD::Tex2DS32FloatGrad: 809 return "NVPTXISD::Tex2DS32FloatGrad"; 810 case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32"; 811 case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float"; 812 case NVPTXISD::Tex2DU32FloatLevel: 813 return "NVPTXISD::Tex2DU32FloatLevel"; 814 case NVPTXISD::Tex2DU32FloatGrad: 815 return "NVPTXISD::Tex2DU32FloatGrad"; 816 case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32"; 817 case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; 818 case NVPTXISD::Tex2DArrayFloatFloatLevel: 819 return "NVPTXISD::Tex2DArrayFloatFloatLevel"; 820 case NVPTXISD::Tex2DArrayFloatFloatGrad: 821 return 
"NVPTXISD::Tex2DArrayFloatFloatGrad"; 822 case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32"; 823 case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float"; 824 case NVPTXISD::Tex2DArrayS32FloatLevel: 825 return "NVPTXISD::Tex2DArrayS32FloatLevel"; 826 case NVPTXISD::Tex2DArrayS32FloatGrad: 827 return "NVPTXISD::Tex2DArrayS32FloatGrad"; 828 case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32"; 829 case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float"; 830 case NVPTXISD::Tex2DArrayU32FloatLevel: 831 return "NVPTXISD::Tex2DArrayU32FloatLevel"; 832 case NVPTXISD::Tex2DArrayU32FloatGrad: 833 return "NVPTXISD::Tex2DArrayU32FloatGrad"; 834 case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32"; 835 case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat"; 836 case NVPTXISD::Tex3DFloatFloatLevel: 837 return "NVPTXISD::Tex3DFloatFloatLevel"; 838 case NVPTXISD::Tex3DFloatFloatGrad: 839 return "NVPTXISD::Tex3DFloatFloatGrad"; 840 case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32"; 841 case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float"; 842 case NVPTXISD::Tex3DS32FloatLevel: 843 return "NVPTXISD::Tex3DS32FloatLevel"; 844 case NVPTXISD::Tex3DS32FloatGrad: 845 return "NVPTXISD::Tex3DS32FloatGrad"; 846 case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32"; 847 case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float"; 848 case NVPTXISD::Tex3DU32FloatLevel: 849 return "NVPTXISD::Tex3DU32FloatLevel"; 850 case NVPTXISD::Tex3DU32FloatGrad: 851 return "NVPTXISD::Tex3DU32FloatGrad"; 852 case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat"; 853 case NVPTXISD::TexCubeFloatFloatLevel: 854 return "NVPTXISD::TexCubeFloatFloatLevel"; 855 case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float"; 856 case NVPTXISD::TexCubeS32FloatLevel: 857 return "NVPTXISD::TexCubeS32FloatLevel"; 858 case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float"; 859 case NVPTXISD::TexCubeU32FloatLevel: 860 return "NVPTXISD::TexCubeU32FloatLevel"; 861 case NVPTXISD::TexCubeArrayFloatFloat: 862 return "NVPTXISD::TexCubeArrayFloatFloat"; 863 case NVPTXISD::TexCubeArrayFloatFloatLevel: 864 return "NVPTXISD::TexCubeArrayFloatFloatLevel"; 865 case NVPTXISD::TexCubeArrayS32Float: 866 return "NVPTXISD::TexCubeArrayS32Float"; 867 case NVPTXISD::TexCubeArrayS32FloatLevel: 868 return "NVPTXISD::TexCubeArrayS32FloatLevel"; 869 case NVPTXISD::TexCubeArrayU32Float: 870 return "NVPTXISD::TexCubeArrayU32Float"; 871 case NVPTXISD::TexCubeArrayU32FloatLevel: 872 return "NVPTXISD::TexCubeArrayU32FloatLevel"; 873 case NVPTXISD::Tld4R2DFloatFloat: 874 return "NVPTXISD::Tld4R2DFloatFloat"; 875 case NVPTXISD::Tld4G2DFloatFloat: 876 return "NVPTXISD::Tld4G2DFloatFloat"; 877 case NVPTXISD::Tld4B2DFloatFloat: 878 return "NVPTXISD::Tld4B2DFloatFloat"; 879 case NVPTXISD::Tld4A2DFloatFloat: 880 return "NVPTXISD::Tld4A2DFloatFloat"; 881 case NVPTXISD::Tld4R2DS64Float: 882 return "NVPTXISD::Tld4R2DS64Float"; 883 case NVPTXISD::Tld4G2DS64Float: 884 return "NVPTXISD::Tld4G2DS64Float"; 885 case NVPTXISD::Tld4B2DS64Float: 886 return "NVPTXISD::Tld4B2DS64Float"; 887 case NVPTXISD::Tld4A2DS64Float: 888 return "NVPTXISD::Tld4A2DS64Float"; 889 case NVPTXISD::Tld4R2DU64Float: 890 return "NVPTXISD::Tld4R2DU64Float"; 891 case NVPTXISD::Tld4G2DU64Float: 892 return "NVPTXISD::Tld4G2DU64Float"; 893 case NVPTXISD::Tld4B2DU64Float: 894 return "NVPTXISD::Tld4B2DU64Float"; 895 case NVPTXISD::Tld4A2DU64Float: 896 return 
"NVPTXISD::Tld4A2DU64Float"; 897 898 case NVPTXISD::TexUnified1DFloatS32: 899 return "NVPTXISD::TexUnified1DFloatS32"; 900 case NVPTXISD::TexUnified1DFloatFloat: 901 return "NVPTXISD::TexUnified1DFloatFloat"; 902 case NVPTXISD::TexUnified1DFloatFloatLevel: 903 return "NVPTXISD::TexUnified1DFloatFloatLevel"; 904 case NVPTXISD::TexUnified1DFloatFloatGrad: 905 return "NVPTXISD::TexUnified1DFloatFloatGrad"; 906 case NVPTXISD::TexUnified1DS32S32: 907 return "NVPTXISD::TexUnified1DS32S32"; 908 case NVPTXISD::TexUnified1DS32Float: 909 return "NVPTXISD::TexUnified1DS32Float"; 910 case NVPTXISD::TexUnified1DS32FloatLevel: 911 return "NVPTXISD::TexUnified1DS32FloatLevel"; 912 case NVPTXISD::TexUnified1DS32FloatGrad: 913 return "NVPTXISD::TexUnified1DS32FloatGrad"; 914 case NVPTXISD::TexUnified1DU32S32: 915 return "NVPTXISD::TexUnified1DU32S32"; 916 case NVPTXISD::TexUnified1DU32Float: 917 return "NVPTXISD::TexUnified1DU32Float"; 918 case NVPTXISD::TexUnified1DU32FloatLevel: 919 return "NVPTXISD::TexUnified1DU32FloatLevel"; 920 case NVPTXISD::TexUnified1DU32FloatGrad: 921 return "NVPTXISD::TexUnified1DU32FloatGrad"; 922 case NVPTXISD::TexUnified1DArrayFloatS32: 923 return "NVPTXISD::TexUnified1DArrayFloatS32"; 924 case NVPTXISD::TexUnified1DArrayFloatFloat: 925 return "NVPTXISD::TexUnified1DArrayFloatFloat"; 926 case NVPTXISD::TexUnified1DArrayFloatFloatLevel: 927 return "NVPTXISD::TexUnified1DArrayFloatFloatLevel"; 928 case NVPTXISD::TexUnified1DArrayFloatFloatGrad: 929 return "NVPTXISD::TexUnified1DArrayFloatFloatGrad"; 930 case NVPTXISD::TexUnified1DArrayS32S32: 931 return "NVPTXISD::TexUnified1DArrayS32S32"; 932 case NVPTXISD::TexUnified1DArrayS32Float: 933 return "NVPTXISD::TexUnified1DArrayS32Float"; 934 case NVPTXISD::TexUnified1DArrayS32FloatLevel: 935 return "NVPTXISD::TexUnified1DArrayS32FloatLevel"; 936 case NVPTXISD::TexUnified1DArrayS32FloatGrad: 937 return "NVPTXISD::TexUnified1DArrayS32FloatGrad"; 938 case NVPTXISD::TexUnified1DArrayU32S32: 939 return "NVPTXISD::TexUnified1DArrayU32S32"; 940 case NVPTXISD::TexUnified1DArrayU32Float: 941 return "NVPTXISD::TexUnified1DArrayU32Float"; 942 case NVPTXISD::TexUnified1DArrayU32FloatLevel: 943 return "NVPTXISD::TexUnified1DArrayU32FloatLevel"; 944 case NVPTXISD::TexUnified1DArrayU32FloatGrad: 945 return "NVPTXISD::TexUnified1DArrayU32FloatGrad"; 946 case NVPTXISD::TexUnified2DFloatS32: 947 return "NVPTXISD::TexUnified2DFloatS32"; 948 case NVPTXISD::TexUnified2DFloatFloat: 949 return "NVPTXISD::TexUnified2DFloatFloat"; 950 case NVPTXISD::TexUnified2DFloatFloatLevel: 951 return "NVPTXISD::TexUnified2DFloatFloatLevel"; 952 case NVPTXISD::TexUnified2DFloatFloatGrad: 953 return "NVPTXISD::TexUnified2DFloatFloatGrad"; 954 case NVPTXISD::TexUnified2DS32S32: 955 return "NVPTXISD::TexUnified2DS32S32"; 956 case NVPTXISD::TexUnified2DS32Float: 957 return "NVPTXISD::TexUnified2DS32Float"; 958 case NVPTXISD::TexUnified2DS32FloatLevel: 959 return "NVPTXISD::TexUnified2DS32FloatLevel"; 960 case NVPTXISD::TexUnified2DS32FloatGrad: 961 return "NVPTXISD::TexUnified2DS32FloatGrad"; 962 case NVPTXISD::TexUnified2DU32S32: 963 return "NVPTXISD::TexUnified2DU32S32"; 964 case NVPTXISD::TexUnified2DU32Float: 965 return "NVPTXISD::TexUnified2DU32Float"; 966 case NVPTXISD::TexUnified2DU32FloatLevel: 967 return "NVPTXISD::TexUnified2DU32FloatLevel"; 968 case NVPTXISD::TexUnified2DU32FloatGrad: 969 return "NVPTXISD::TexUnified2DU32FloatGrad"; 970 case NVPTXISD::TexUnified2DArrayFloatS32: 971 return "NVPTXISD::TexUnified2DArrayFloatS32"; 972 case 
NVPTXISD::TexUnified2DArrayFloatFloat: 973 return "NVPTXISD::TexUnified2DArrayFloatFloat"; 974 case NVPTXISD::TexUnified2DArrayFloatFloatLevel: 975 return "NVPTXISD::TexUnified2DArrayFloatFloatLevel"; 976 case NVPTXISD::TexUnified2DArrayFloatFloatGrad: 977 return "NVPTXISD::TexUnified2DArrayFloatFloatGrad"; 978 case NVPTXISD::TexUnified2DArrayS32S32: 979 return "NVPTXISD::TexUnified2DArrayS32S32"; 980 case NVPTXISD::TexUnified2DArrayS32Float: 981 return "NVPTXISD::TexUnified2DArrayS32Float"; 982 case NVPTXISD::TexUnified2DArrayS32FloatLevel: 983 return "NVPTXISD::TexUnified2DArrayS32FloatLevel"; 984 case NVPTXISD::TexUnified2DArrayS32FloatGrad: 985 return "NVPTXISD::TexUnified2DArrayS32FloatGrad"; 986 case NVPTXISD::TexUnified2DArrayU32S32: 987 return "NVPTXISD::TexUnified2DArrayU32S32"; 988 case NVPTXISD::TexUnified2DArrayU32Float: 989 return "NVPTXISD::TexUnified2DArrayU32Float"; 990 case NVPTXISD::TexUnified2DArrayU32FloatLevel: 991 return "NVPTXISD::TexUnified2DArrayU32FloatLevel"; 992 case NVPTXISD::TexUnified2DArrayU32FloatGrad: 993 return "NVPTXISD::TexUnified2DArrayU32FloatGrad"; 994 case NVPTXISD::TexUnified3DFloatS32: 995 return "NVPTXISD::TexUnified3DFloatS32"; 996 case NVPTXISD::TexUnified3DFloatFloat: 997 return "NVPTXISD::TexUnified3DFloatFloat"; 998 case NVPTXISD::TexUnified3DFloatFloatLevel: 999 return "NVPTXISD::TexUnified3DFloatFloatLevel"; 1000 case NVPTXISD::TexUnified3DFloatFloatGrad: 1001 return "NVPTXISD::TexUnified3DFloatFloatGrad"; 1002 case NVPTXISD::TexUnified3DS32S32: 1003 return "NVPTXISD::TexUnified3DS32S32"; 1004 case NVPTXISD::TexUnified3DS32Float: 1005 return "NVPTXISD::TexUnified3DS32Float"; 1006 case NVPTXISD::TexUnified3DS32FloatLevel: 1007 return "NVPTXISD::TexUnified3DS32FloatLevel"; 1008 case NVPTXISD::TexUnified3DS32FloatGrad: 1009 return "NVPTXISD::TexUnified3DS32FloatGrad"; 1010 case NVPTXISD::TexUnified3DU32S32: 1011 return "NVPTXISD::TexUnified3DU32S32"; 1012 case NVPTXISD::TexUnified3DU32Float: 1013 return "NVPTXISD::TexUnified3DU32Float"; 1014 case NVPTXISD::TexUnified3DU32FloatLevel: 1015 return "NVPTXISD::TexUnified3DU32FloatLevel"; 1016 case NVPTXISD::TexUnified3DU32FloatGrad: 1017 return "NVPTXISD::TexUnified3DU32FloatGrad"; 1018 case NVPTXISD::TexUnifiedCubeFloatFloat: 1019 return "NVPTXISD::TexUnifiedCubeFloatFloat"; 1020 case NVPTXISD::TexUnifiedCubeFloatFloatLevel: 1021 return "NVPTXISD::TexUnifiedCubeFloatFloatLevel"; 1022 case NVPTXISD::TexUnifiedCubeS32Float: 1023 return "NVPTXISD::TexUnifiedCubeS32Float"; 1024 case NVPTXISD::TexUnifiedCubeS32FloatLevel: 1025 return "NVPTXISD::TexUnifiedCubeS32FloatLevel"; 1026 case NVPTXISD::TexUnifiedCubeU32Float: 1027 return "NVPTXISD::TexUnifiedCubeU32Float"; 1028 case NVPTXISD::TexUnifiedCubeU32FloatLevel: 1029 return "NVPTXISD::TexUnifiedCubeU32FloatLevel"; 1030 case NVPTXISD::TexUnifiedCubeArrayFloatFloat: 1031 return "NVPTXISD::TexUnifiedCubeArrayFloatFloat"; 1032 case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: 1033 return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel"; 1034 case NVPTXISD::TexUnifiedCubeArrayS32Float: 1035 return "NVPTXISD::TexUnifiedCubeArrayS32Float"; 1036 case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: 1037 return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel"; 1038 case NVPTXISD::TexUnifiedCubeArrayU32Float: 1039 return "NVPTXISD::TexUnifiedCubeArrayU32Float"; 1040 case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: 1041 return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel"; 1042 case NVPTXISD::Tld4UnifiedR2DFloatFloat: 1043 return "NVPTXISD::Tld4UnifiedR2DFloatFloat"; 1044 
case NVPTXISD::Tld4UnifiedG2DFloatFloat: 1045 return "NVPTXISD::Tld4UnifiedG2DFloatFloat"; 1046 case NVPTXISD::Tld4UnifiedB2DFloatFloat: 1047 return "NVPTXISD::Tld4UnifiedB2DFloatFloat"; 1048 case NVPTXISD::Tld4UnifiedA2DFloatFloat: 1049 return "NVPTXISD::Tld4UnifiedA2DFloatFloat"; 1050 case NVPTXISD::Tld4UnifiedR2DS64Float: 1051 return "NVPTXISD::Tld4UnifiedR2DS64Float"; 1052 case NVPTXISD::Tld4UnifiedG2DS64Float: 1053 return "NVPTXISD::Tld4UnifiedG2DS64Float"; 1054 case NVPTXISD::Tld4UnifiedB2DS64Float: 1055 return "NVPTXISD::Tld4UnifiedB2DS64Float"; 1056 case NVPTXISD::Tld4UnifiedA2DS64Float: 1057 return "NVPTXISD::Tld4UnifiedA2DS64Float"; 1058 case NVPTXISD::Tld4UnifiedR2DU64Float: 1059 return "NVPTXISD::Tld4UnifiedR2DU64Float"; 1060 case NVPTXISD::Tld4UnifiedG2DU64Float: 1061 return "NVPTXISD::Tld4UnifiedG2DU64Float"; 1062 case NVPTXISD::Tld4UnifiedB2DU64Float: 1063 return "NVPTXISD::Tld4UnifiedB2DU64Float"; 1064 case NVPTXISD::Tld4UnifiedA2DU64Float: 1065 return "NVPTXISD::Tld4UnifiedA2DU64Float"; 1066 1067 case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp"; 1068 case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp"; 1069 case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp"; 1070 case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp"; 1071 case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp"; 1072 case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp"; 1073 case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp"; 1074 case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp"; 1075 case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp"; 1076 case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp"; 1077 case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp"; 1078 1079 case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp"; 1080 case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp"; 1081 case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp"; 1082 case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp"; 1083 case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp"; 1084 case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp"; 1085 case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp"; 1086 case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp"; 1087 case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp"; 1088 case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp"; 1089 case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp"; 1090 1091 case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp"; 1092 case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp"; 1093 case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp"; 1094 case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp"; 1095 case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp"; 1096 case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp"; 1097 case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp"; 1098 case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp"; 1099 case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp"; 1100 case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp"; 1101 case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp"; 1102 
1103 case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp"; 1104 case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp"; 1105 case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp"; 1106 case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp"; 1107 case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp"; 1108 case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp"; 1109 case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp"; 1110 case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp"; 1111 case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp"; 1112 case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp"; 1113 case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp"; 1114 1115 case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp"; 1116 case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp"; 1117 case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp"; 1118 case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp"; 1119 case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp"; 1120 case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp"; 1121 case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp"; 1122 case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp"; 1123 case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp"; 1124 case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp"; 1125 case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp"; 1126 1127 case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap"; 1128 case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap"; 1129 case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap"; 1130 case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap"; 1131 case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap"; 1132 case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap"; 1133 case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap"; 1134 case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap"; 1135 case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap"; 1136 case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap"; 1137 case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap"; 1138 1139 case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap"; 1140 case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap"; 1141 case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap"; 1142 case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap"; 1143 case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap"; 1144 case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap"; 1145 case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap"; 1146 case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap"; 1147 case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap"; 1148 case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap"; 1149 case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap"; 1150 1151 case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap"; 1152 case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap"; 1153 case 
NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap"; 1154 case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap"; 1155 case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap"; 1156 case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap"; 1157 case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap"; 1158 case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap"; 1159 case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap"; 1160 case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap"; 1161 case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap"; 1162 1163 case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap"; 1164 case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap"; 1165 case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap"; 1166 case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap"; 1167 case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap"; 1168 case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap"; 1169 case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap"; 1170 case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap"; 1171 case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap"; 1172 case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap"; 1173 case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap"; 1174 1175 case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap"; 1176 case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap"; 1177 case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap"; 1178 case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap"; 1179 case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap"; 1180 case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap"; 1181 case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap"; 1182 case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap"; 1183 case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap"; 1184 case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap"; 1185 case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap"; 1186 1187 case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero"; 1188 case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero"; 1189 case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero"; 1190 case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero"; 1191 case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero"; 1192 case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero"; 1193 case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero"; 1194 case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero"; 1195 case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero"; 1196 case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero"; 1197 case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero"; 1198 1199 case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero"; 1200 case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero"; 1201 case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero"; 1202 case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero"; 1203 case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero"; 1204 case NVPTXISD::Suld1DArrayV2I16Zero: return 
"NVPTXISD::Suld1DArrayV2I16Zero"; 1205 case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero"; 1206 case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero"; 1207 case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero"; 1208 case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero"; 1209 case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero"; 1210 1211 case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero"; 1212 case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero"; 1213 case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero"; 1214 case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero"; 1215 case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero"; 1216 case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero"; 1217 case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero"; 1218 case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero"; 1219 case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero"; 1220 case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero"; 1221 case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero"; 1222 1223 case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero"; 1224 case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero"; 1225 case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero"; 1226 case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero"; 1227 case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero"; 1228 case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero"; 1229 case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero"; 1230 case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero"; 1231 case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero"; 1232 case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero"; 1233 case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero"; 1234 1235 case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero"; 1236 case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero"; 1237 case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero"; 1238 case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero"; 1239 case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero"; 1240 case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero"; 1241 case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero"; 1242 case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero"; 1243 case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero"; 1244 case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero"; 1245 case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero"; 1246 } 1247 return nullptr; 1248 } 1249 1250 TargetLoweringBase::LegalizeTypeAction 1251 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const { 1252 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && 1253 VT.getScalarType() == MVT::i1) 1254 return TypeSplitVector; 1255 if (VT == MVT::v2f16) 1256 return TypeLegal; 1257 return TargetLoweringBase::getPreferredVectorAction(VT); 1258 } 1259 1260 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, 1261 int Enabled, int &ExtraSteps, 1262 bool &UseOneConst, 1263 bool Reciprocal) const { 1264 if (!(Enabled == ReciprocalEstimate::Enabled || 
1265 (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) 1266 return SDValue(); 1267 1268 if (ExtraSteps == ReciprocalEstimate::Unspecified) 1269 ExtraSteps = 0; 1270 1271 SDLoc DL(Operand); 1272 EVT VT = Operand.getValueType(); 1273 bool Ftz = useF32FTZ(DAG.getMachineFunction()); 1274 1275 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) { 1276 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 1277 DAG.getConstant(IID, DL, MVT::i32), Operand); 1278 }; 1279 1280 // The sqrt and rsqrt refinement processes assume we always start out with an 1281 // approximation of the rsqrt. Therefore, if we're going to do any refinement 1282 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing 1283 // any refinement, we must return a regular sqrt. 1284 if (Reciprocal || ExtraSteps > 0) { 1285 if (VT == MVT::f32) 1286 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f 1287 : Intrinsic::nvvm_rsqrt_approx_f); 1288 else if (VT == MVT::f64) 1289 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d); 1290 else 1291 return SDValue(); 1292 } else { 1293 if (VT == MVT::f32) 1294 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f 1295 : Intrinsic::nvvm_sqrt_approx_f); 1296 else { 1297 // There's no sqrt.approx.f64 instruction, so we emit 1298 // reciprocal(rsqrt(x)). This is faster than 1299 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain 1300 // x * rsqrt(x).) 1301 return DAG.getNode( 1302 ISD::INTRINSIC_WO_CHAIN, DL, VT, 1303 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32), 1304 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d)); 1305 } 1306 } 1307 } 1308 1309 SDValue 1310 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 1311 SDLoc dl(Op); 1312 const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op); 1313 auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace()); 1314 Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT); 1315 return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op); 1316 } 1317 1318 std::string NVPTXTargetLowering::getPrototype( 1319 const DataLayout &DL, Type *retTy, const ArgListTy &Args, 1320 const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment, 1321 std::optional<std::pair<unsigned, const APInt &>> VAInfo, 1322 const CallBase &CB, unsigned UniqueCallSite) const { 1323 auto PtrVT = getPointerTy(DL); 1324 1325 bool isABI = (STI.getSmVersion() >= 20); 1326 assert(isABI && "Non-ABI compilation is not supported"); 1327 if (!isABI) 1328 return ""; 1329 1330 std::string Prototype; 1331 raw_string_ostream O(Prototype); 1332 O << "prototype_" << UniqueCallSite << " : .callprototype "; 1333 1334 if (retTy->getTypeID() == Type::VoidTyID) { 1335 O << "()"; 1336 } else { 1337 O << "("; 1338 if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) { 1339 unsigned size = 0; 1340 if (auto *ITy = dyn_cast<IntegerType>(retTy)) { 1341 size = ITy->getBitWidth(); 1342 } else { 1343 assert(retTy->isFloatingPointTy() && 1344 "Floating point type expected here"); 1345 size = retTy->getPrimitiveSizeInBits(); 1346 } 1347 // PTX ABI requires all scalar return values to be at least 32 1348 // bits in size. fp16 normally uses .b16 as its storage type in 1349 // PTX, so its size must be adjusted here, too. 
1350 size = promoteScalarArgumentSize(size); 1351 1352 O << ".param .b" << size << " _"; 1353 } else if (isa<PointerType>(retTy)) { 1354 O << ".param .b" << PtrVT.getSizeInBits() << " _"; 1355 } else if (retTy->isAggregateType() || retTy->isVectorTy() || 1356 retTy->isIntegerTy(128)) { 1357 O << ".param .align " << (retAlignment ? retAlignment->value() : 0) 1358 << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]"; 1359 } else { 1360 llvm_unreachable("Unknown return type"); 1361 } 1362 O << ") "; 1363 } 1364 O << "_ ("; 1365 1366 bool first = true; 1367 1368 const Function *F = CB.getFunction(); 1369 unsigned NumArgs = VAInfo ? VAInfo->first : Args.size(); 1370 for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) { 1371 Type *Ty = Args[i].Ty; 1372 if (!first) { 1373 O << ", "; 1374 } 1375 first = false; 1376 1377 if (!Outs[OIdx].Flags.isByVal()) { 1378 if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { 1379 unsigned ParamAlign = 0; 1380 const CallInst *CallI = cast<CallInst>(&CB); 1381 // +1 because index 0 is reserved for return type alignment 1382 if (!getAlign(*CallI, i + 1, ParamAlign)) 1383 ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value(); 1384 O << ".param .align " << ParamAlign << " .b8 "; 1385 O << "_"; 1386 O << "[" << DL.getTypeAllocSize(Ty) << "]"; 1387 // update the index for Outs 1388 SmallVector<EVT, 16> vtparts; 1389 ComputeValueVTs(*this, DL, Ty, vtparts); 1390 if (unsigned len = vtparts.size()) 1391 OIdx += len - 1; 1392 continue; 1393 } 1394 // i8 types in IR will be i16 types in SDAG 1395 assert((getValueType(DL, Ty) == Outs[OIdx].VT || 1396 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && 1397 "type mismatch between callee prototype and arguments"); 1398 // scalar type 1399 unsigned sz = 0; 1400 if (isa<IntegerType>(Ty)) { 1401 sz = cast<IntegerType>(Ty)->getBitWidth(); 1402 sz = promoteScalarArgumentSize(sz); 1403 } else if (isa<PointerType>(Ty)) { 1404 sz = PtrVT.getSizeInBits(); 1405 } else if (Ty->isHalfTy()) 1406 // PTX ABI requires all scalar parameters to be at least 32 1407 // bits in size. fp16 normally uses .b16 as its storage type 1408 // in PTX, so its size must be adjusted here, too. 1409 sz = 32; 1410 else 1411 sz = Ty->getPrimitiveSizeInBits(); 1412 O << ".param .b" << sz << " "; 1413 O << "_"; 1414 continue; 1415 } 1416 1417 Type *ETy = Args[i].IndirectType; 1418 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); 1419 Align ParamByValAlign = 1420 getFunctionByValParamAlign(F, ETy, InitialAlign, DL); 1421 1422 O << ".param .align " << ParamByValAlign.value() << " .b8 "; 1423 O << "_"; 1424 O << "[" << Outs[OIdx].Flags.getByValSize() << "]"; 1425 } 1426 1427 if (VAInfo) 1428 O << (first ? "" : ",") << " .param .align " << VAInfo->second 1429 << " .b8 _[]\n"; 1430 O << ")"; 1431 if (shouldEmitPTXNoReturn(&CB, *nvTM)) 1432 O << " .noreturn"; 1433 O << ";"; 1434 1435 return Prototype; 1436 } 1437 1438 Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, 1439 const CallBase *CB, Type *Ty, 1440 unsigned Idx, 1441 const DataLayout &DL) const { 1442 if (!CB) { 1443 // CallSite is zero, fallback to ABI type alignment 1444 return DL.getABITypeAlign(Ty); 1445 } 1446 1447 unsigned Alignment = 0; 1448 const Function *DirectCallee = CB->getCalledFunction(); 1449 1450 if (!DirectCallee) { 1451 // We don't have a direct function symbol, but that may be because of 1452 // constant cast instructions in the call. 
1453 1454 // With bitcast'd call targets, the instruction will be the call 1455 if (const auto *CI = dyn_cast<CallInst>(CB)) { 1456 // Check if we have call alignment metadata 1457 if (getAlign(*CI, Idx, Alignment)) 1458 return Align(Alignment); 1459 } 1460 DirectCallee = getMaybeBitcastedCallee(CB); 1461 } 1462 1463 // Check for function alignment information if we found that the 1464 // ultimate target is a Function 1465 if (DirectCallee) { 1466 if (getAlign(*DirectCallee, Idx, Alignment)) 1467 return Align(Alignment); 1468 // If alignment information is not available, fall back to the 1469 // default function param optimized type alignment 1470 return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL); 1471 } 1472 1473 // Call is indirect, fall back to the ABI type alignment 1474 return DL.getABITypeAlign(Ty); 1475 } 1476 1477 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 1478 SmallVectorImpl<SDValue> &InVals) const { 1479 1480 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30)) 1481 report_fatal_error( 1482 "Support for variadic functions (unsized array parameter) introduced " 1483 "in PTX ISA version 6.0 and requires target sm_30."); 1484 1485 SelectionDAG &DAG = CLI.DAG; 1486 SDLoc dl = CLI.DL; 1487 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1488 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1489 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1490 SDValue Chain = CLI.Chain; 1491 SDValue Callee = CLI.Callee; 1492 bool &isTailCall = CLI.IsTailCall; 1493 ArgListTy &Args = CLI.getArgs(); 1494 Type *RetTy = CLI.RetTy; 1495 const CallBase *CB = CLI.CB; 1496 const DataLayout &DL = DAG.getDataLayout(); 1497 1498 bool isABI = (STI.getSmVersion() >= 20); 1499 assert(isABI && "Non-ABI compilation is not supported"); 1500 if (!isABI) 1501 return Chain; 1502 1503 // Variadic arguments. 1504 // 1505 // Normally, for each argument, we declare a param scalar or a param 1506 // byte array in the .param space, and store the argument value to that 1507 // param scalar or array starting at offset 0. 1508 // 1509 // In the case of the first variadic argument, we declare a vararg byte array 1510 // with size 0. The exact size of this array isn't known at this point, so 1511 // it'll be patched later. All the variadic arguments will be stored to this 1512 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is 1513 // initially set to 0, so it can be used for non-variadic arguments (which use 1514 // 0 offset) to simplify the code. 1515 // 1516 // After all vararg is processed, 'VAOffset' holds the size of the 1517 // vararg byte array. 1518 1519 SDValue VADeclareParam; // vararg byte array 1520 unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic 1521 unsigned VAOffset = 0; // current offset in the param array 1522 1523 unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1); 1524 SDValue TempChain = Chain; 1525 Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl); 1526 SDValue InFlag = Chain.getValue(1); 1527 1528 unsigned ParamCount = 0; 1529 // Args.size() and Outs.size() need not match. 1530 // Outs.size() will be larger 1531 // * if there is an aggregate argument with multiple fields (each field 1532 // showing up separately in Outs) 1533 // * if there is a vector argument with more than typical vector-length 1534 // elements (generally if more than 4) where each vector element is 1535 // individually present in Outs. 
1536 // So a different index should be used for indexing into Outs/OutVals. 1537 // See similar issue in LowerFormalArguments. 1538 unsigned OIdx = 0; 1539 // Declare the .params or .reg need to pass values 1540 // to the function 1541 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { 1542 EVT VT = Outs[OIdx].VT; 1543 Type *Ty = Args[i].Ty; 1544 bool IsVAArg = (i >= CLI.NumFixedArgs); 1545 bool IsByVal = Outs[OIdx].Flags.isByVal(); 1546 1547 SmallVector<EVT, 16> VTs; 1548 SmallVector<uint64_t, 16> Offsets; 1549 1550 assert((!IsByVal || Args[i].IndirectType) && 1551 "byval arg must have indirect type"); 1552 Type *ETy = (IsByVal ? Args[i].IndirectType : Ty); 1553 ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset); 1554 1555 Align ArgAlign; 1556 if (IsByVal) { 1557 // The ByValAlign in the Outs[OIdx].Flags is always set at this point, 1558 // so we don't need to worry whether it's naturally aligned or not. 1559 // See TargetLowering::LowerCallTo(). 1560 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); 1561 ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy, 1562 InitialAlign, DL); 1563 if (IsVAArg) 1564 VAOffset = alignTo(VAOffset, ArgAlign); 1565 } else { 1566 ArgAlign = getArgumentAlignment(Callee, CB, Ty, ParamCount + 1, DL); 1567 } 1568 1569 unsigned TypeSize = 1570 (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty)); 1571 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1572 1573 bool NeedAlign; // Does argument declaration specify alignment? 1574 if (IsVAArg) { 1575 if (ParamCount == FirstVAArg) { 1576 SDValue DeclareParamOps[] = { 1577 Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32), 1578 DAG.getConstant(ParamCount, dl, MVT::i32), 1579 DAG.getConstant(1, dl, MVT::i32), InFlag}; 1580 VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, 1581 DeclareParamVTs, DeclareParamOps); 1582 } 1583 NeedAlign = IsByVal || Ty->isAggregateType() || Ty->isVectorTy() || 1584 Ty->isIntegerTy(128); 1585 } else if (IsByVal || Ty->isAggregateType() || Ty->isVectorTy() || 1586 Ty->isIntegerTy(128)) { 1587 // declare .param .align <align> .b8 .param<n>[<size>]; 1588 SDValue DeclareParamOps[] = { 1589 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32), 1590 DAG.getConstant(ParamCount, dl, MVT::i32), 1591 DAG.getConstant(TypeSize, dl, MVT::i32), InFlag}; 1592 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, 1593 DeclareParamOps); 1594 NeedAlign = true; 1595 } else { 1596 // declare .param .b<size> .param<n>; 1597 if (VT.isInteger() || VT.isFloatingPoint()) { 1598 // PTX ABI requires integral types to be at least 32 bits in 1599 // size. FP16 is loaded/stored using i16, so it's handled 1600 // here as well. 1601 TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8; 1602 } 1603 SDValue DeclareScalarParamOps[] = { 1604 Chain, DAG.getConstant(ParamCount, dl, MVT::i32), 1605 DAG.getConstant(TypeSize * 8, dl, MVT::i32), 1606 DAG.getConstant(0, dl, MVT::i32), InFlag}; 1607 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, 1608 DeclareScalarParamOps); 1609 NeedAlign = false; 1610 } 1611 InFlag = Chain.getValue(1); 1612 1613 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter 1614 // than 32-bits are sign extended or zero extended, depending on 1615 // whether they are signed or unsigned types. This case applies 1616 // only to scalar parameters and not to aggregate values. 
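// Illustrative example (assumption, not from the original source): an i8 or
// i16 scalar argument is widened here to i32 and stored with a 32-bit
// st.param, matching the .param .b32 declaration emitted for it above.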
1617 bool ExtendIntegerParam = 1618 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; 1619 1620 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg); 1621 SmallVector<SDValue, 6> StoreOperands; 1622 for (unsigned j = 0, je = VTs.size(); j != je; ++j) { 1623 EVT EltVT = VTs[j]; 1624 int CurOffset = Offsets[j]; 1625 MaybeAlign PartAlign; 1626 if (NeedAlign) 1627 PartAlign = commonAlignment(ArgAlign, CurOffset); 1628 1629 // New store. 1630 if (VectorInfo[j] & PVF_FIRST) { 1631 assert(StoreOperands.empty() && "Unfinished preceding store."); 1632 StoreOperands.push_back(Chain); 1633 StoreOperands.push_back( 1634 DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32)); 1635 StoreOperands.push_back(DAG.getConstant( 1636 IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset), 1637 dl, MVT::i32)); 1638 } 1639 1640 SDValue StVal = OutVals[OIdx]; 1641 1642 MVT PromotedVT; 1643 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) { 1644 EltVT = EVT(PromotedVT); 1645 } 1646 if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) { 1647 llvm::ISD::NodeType Ext = 1648 Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 1649 StVal = DAG.getNode(Ext, dl, PromotedVT, StVal); 1650 } 1651 1652 if (IsByVal) { 1653 auto PtrVT = getPointerTy(DL); 1654 SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal, 1655 DAG.getConstant(CurOffset, dl, PtrVT)); 1656 StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(), 1657 PartAlign); 1658 } else if (ExtendIntegerParam) { 1659 assert(VTs.size() == 1 && "Scalar can't have multiple parts."); 1660 // zext/sext to i32 1661 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 1662 : ISD::ZERO_EXTEND, 1663 dl, MVT::i32, StVal); 1664 } 1665 1666 if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) { 1667 // Use 16-bit registers for small stores as it's the 1668 // smallest general purpose register size supported by NVPTX. 1669 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); 1670 } 1671 1672 // Record the value to store. 1673 StoreOperands.push_back(StVal); 1674 1675 if (VectorInfo[j] & PVF_LAST) { 1676 unsigned NumElts = StoreOperands.size() - 3; 1677 NVPTXISD::NodeType Op; 1678 switch (NumElts) { 1679 case 1: 1680 Op = NVPTXISD::StoreParam; 1681 break; 1682 case 2: 1683 Op = NVPTXISD::StoreParamV2; 1684 break; 1685 case 4: 1686 Op = NVPTXISD::StoreParamV4; 1687 break; 1688 default: 1689 llvm_unreachable("Invalid vector info."); 1690 } 1691 1692 StoreOperands.push_back(InFlag); 1693 1694 // Adjust type of the store op if we've extended the scalar 1695 // return value. 1696 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; 1697 1698 Chain = DAG.getMemIntrinsicNode( 1699 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, 1700 TheStoreType, MachinePointerInfo(), PartAlign, 1701 MachineMemOperand::MOStore); 1702 InFlag = Chain.getValue(1); 1703 1704 // Cleanup. 1705 StoreOperands.clear(); 1706 1707 // TODO: We may need to support vector types that can be passed 1708 // as scalars in variadic arguments. 
1709 if (!IsByVal && IsVAArg) { 1710 assert(NumElts == 1 && 1711 "Vectorization is expected to be disabled for variadics."); 1712 VAOffset += DL.getTypeAllocSize( 1713 TheStoreType.getTypeForEVT(*DAG.getContext())); 1714 } 1715 } 1716 if (!IsByVal) 1717 ++OIdx; 1718 } 1719 assert(StoreOperands.empty() && "Unfinished parameter store."); 1720 if (!IsByVal && VTs.size() > 0) 1721 --OIdx; 1722 ++ParamCount; 1723 if (IsByVal && IsVAArg) 1724 VAOffset += TypeSize; 1725 } 1726 1727 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); 1728 MaybeAlign retAlignment = std::nullopt; 1729 1730 // Handle Result 1731 if (Ins.size() > 0) { 1732 SmallVector<EVT, 16> resvtparts; 1733 ComputeValueVTs(*this, DL, RetTy, resvtparts); 1734 1735 // Declare 1736 // .param .align 16 .b8 retval0[<size-in-bytes>], or 1737 // .param .b<size-in-bits> retval0 1738 unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy); 1739 // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for 1740 // these three types to match the logic in 1741 // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype. 1742 // Plus, this behavior is consistent with nvcc's. 1743 if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() || 1744 (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) { 1745 resultsz = promoteScalarArgumentSize(resultsz); 1746 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1747 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 1748 DAG.getConstant(resultsz, dl, MVT::i32), 1749 DAG.getConstant(0, dl, MVT::i32), InFlag }; 1750 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, 1751 DeclareRetOps); 1752 InFlag = Chain.getValue(1); 1753 } else { 1754 retAlignment = getArgumentAlignment(Callee, CB, RetTy, 0, DL); 1755 assert(retAlignment && "retAlignment is guaranteed to be set"); 1756 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1757 SDValue DeclareRetOps[] = { 1758 Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32), 1759 DAG.getConstant(resultsz / 8, dl, MVT::i32), 1760 DAG.getConstant(0, dl, MVT::i32), InFlag}; 1761 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, 1762 DeclareRetOps); 1763 InFlag = Chain.getValue(1); 1764 } 1765 } 1766 1767 bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs); 1768 // Set the size of the vararg param byte array if the callee is a variadic 1769 // function and the variadic part is not empty. 1770 if (HasVAArgs) { 1771 SDValue DeclareParamOps[] = { 1772 VADeclareParam.getOperand(0), VADeclareParam.getOperand(1), 1773 VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32), 1774 VADeclareParam.getOperand(4)}; 1775 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(), 1776 VADeclareParam->getVTList(), DeclareParamOps); 1777 } 1778 1779 // Both indirect calls and libcalls have nullptr Func. In order to distinguish 1780 // between them we must rely on the call site value which is valid for 1781 // indirect calls but is always null for libcalls. 1782 bool isIndirectCall = !Func && CB; 1783 1784 if (isa<ExternalSymbolSDNode>(Callee)) { 1785 Function* CalleeFunc = nullptr; 1786 1787 // Try to find the callee in the current module. 1788 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc); 1789 assert(CalleeFunc != nullptr && "Libcall callee must be set."); 1790 1791 // Set the "libcall callee" attribute to indicate that the function 1792 // must always have a declaration. 
1793 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true"); 1794 } 1795 1796 if (isIndirectCall) { 1797 // This is the indirect function call case: PTX requires a prototype of the 1798 // form 1799 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _); 1800 // to be emitted, and the label has to be used as the last arg of the call 1801 // instruction. 1802 // The prototype is embedded in a string and put as the operand for a 1803 // CallPrototype SDNode which will print out to the value of the string. 1804 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1805 std::string Proto = getPrototype( 1806 DL, RetTy, Args, Outs, retAlignment, 1807 HasVAArgs 1808 ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair( 1809 CLI.NumFixedArgs, 1810 cast<ConstantSDNode>(VADeclareParam->getOperand(1)) 1811 ->getAPIntValue())) 1812 : std::nullopt, 1813 *CB, UniqueCallSite); 1814 const char *ProtoStr = nvTM->getStrPool().save(Proto).data(); 1815 SDValue ProtoOps[] = { 1816 Chain, 1817 DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), 1818 InFlag, 1819 }; 1820 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps); 1821 InFlag = Chain.getValue(1); 1822 } 1823 // Op to just print "call" 1824 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1825 SDValue PrintCallOps[] = { 1826 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag 1827 }; 1828 // We model convergent calls as separate opcodes. 1829 unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni; 1830 if (CLI.IsConvergent) 1831 Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni 1832 : NVPTXISD::PrintConvergentCall; 1833 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps); 1834 InFlag = Chain.getValue(1); 1835 1836 // Ops to print out the function name 1837 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1838 SDValue CallVoidOps[] = { Chain, Callee, InFlag }; 1839 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps); 1840 InFlag = Chain.getValue(1); 1841 1842 // Ops to print out the param list 1843 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1844 SDValue CallArgBeginOps[] = { Chain, InFlag }; 1845 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs, 1846 CallArgBeginOps); 1847 InFlag = Chain.getValue(1); 1848 1849 for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e; 1850 ++i) { 1851 unsigned opcode; 1852 if (i == (e - 1)) 1853 opcode = NVPTXISD::LastCallArg; 1854 else 1855 opcode = NVPTXISD::CallArg; 1856 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1857 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 1858 DAG.getConstant(i, dl, MVT::i32), InFlag }; 1859 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps); 1860 InFlag = Chain.getValue(1); 1861 } 1862 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1863 SDValue CallArgEndOps[] = { Chain, 1864 DAG.getConstant(isIndirectCall ?
0 : 1, dl, MVT::i32), 1865 InFlag }; 1866 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); 1867 InFlag = Chain.getValue(1); 1868 1869 if (isIndirectCall) { 1870 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1871 SDValue PrototypeOps[] = { 1872 Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InFlag}; 1873 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps); 1874 InFlag = Chain.getValue(1); 1875 } 1876 1877 SmallVector<SDValue, 16> ProxyRegOps; 1878 SmallVector<std::optional<MVT>, 16> ProxyRegTruncates; 1879 1880 // Generate loads from param memory/moves from registers for result 1881 if (Ins.size() > 0) { 1882 SmallVector<EVT, 16> VTs; 1883 SmallVector<uint64_t, 16> Offsets; 1884 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); 1885 assert(VTs.size() == Ins.size() && "Bad value decomposition"); 1886 1887 Align RetAlign = getArgumentAlignment(Callee, CB, RetTy, 0, DL); 1888 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); 1889 1890 SmallVector<EVT, 6> LoadVTs; 1891 int VecIdx = -1; // Index of the first element of the vector. 1892 1893 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 1894 // 32-bits are sign extended or zero extended, depending on whether 1895 // they are signed or unsigned types. 1896 bool ExtendIntegerRetVal = 1897 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 1898 1899 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 1900 bool needTruncate = false; 1901 EVT TheLoadType = VTs[i]; 1902 EVT EltType = Ins[i].VT; 1903 Align EltAlign = commonAlignment(RetAlign, Offsets[i]); 1904 MVT PromotedVT; 1905 1906 if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) { 1907 TheLoadType = EVT(PromotedVT); 1908 EltType = EVT(PromotedVT); 1909 needTruncate = true; 1910 } 1911 1912 if (ExtendIntegerRetVal) { 1913 TheLoadType = MVT::i32; 1914 EltType = MVT::i32; 1915 needTruncate = true; 1916 } else if (TheLoadType.getSizeInBits() < 16) { 1917 if (VTs[i].isInteger()) 1918 needTruncate = true; 1919 EltType = MVT::i16; 1920 } 1921 1922 // Record index of the very first element of the vector. 
1923 if (VectorInfo[i] & PVF_FIRST) { 1924 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); 1925 VecIdx = i; 1926 } 1927 1928 LoadVTs.push_back(EltType); 1929 1930 if (VectorInfo[i] & PVF_LAST) { 1931 unsigned NumElts = LoadVTs.size(); 1932 LoadVTs.push_back(MVT::Other); 1933 LoadVTs.push_back(MVT::Glue); 1934 NVPTXISD::NodeType Op; 1935 switch (NumElts) { 1936 case 1: 1937 Op = NVPTXISD::LoadParam; 1938 break; 1939 case 2: 1940 Op = NVPTXISD::LoadParamV2; 1941 break; 1942 case 4: 1943 Op = NVPTXISD::LoadParamV4; 1944 break; 1945 default: 1946 llvm_unreachable("Invalid vector info."); 1947 } 1948 1949 SDValue LoadOperands[] = { 1950 Chain, DAG.getConstant(1, dl, MVT::i32), 1951 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag}; 1952 SDValue RetVal = DAG.getMemIntrinsicNode( 1953 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, 1954 MachinePointerInfo(), EltAlign, 1955 MachineMemOperand::MOLoad); 1956 1957 for (unsigned j = 0; j < NumElts; ++j) { 1958 ProxyRegOps.push_back(RetVal.getValue(j)); 1959 1960 if (needTruncate) 1961 ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT)); 1962 else 1963 ProxyRegTruncates.push_back(std::optional<MVT>()); 1964 } 1965 1966 Chain = RetVal.getValue(NumElts); 1967 InFlag = RetVal.getValue(NumElts + 1); 1968 1969 // Cleanup 1970 VecIdx = -1; 1971 LoadVTs.clear(); 1972 } 1973 } 1974 } 1975 1976 Chain = 1977 DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InFlag, dl); 1978 InFlag = Chain.getValue(1); 1979 1980 // Append ProxyReg instructions to the chain to make sure that `callseq_end` 1981 // will not get lost. Otherwise, during libcalls expansion, the nodes can become 1982 // dangling. 1983 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) { 1984 SDValue Ret = DAG.getNode( 1985 NVPTXISD::ProxyReg, dl, 1986 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue), 1987 { Chain, ProxyRegOps[i], InFlag } 1988 ); 1989 1990 Chain = Ret.getValue(1); 1991 InFlag = Ret.getValue(2); 1992 1993 if (ProxyRegTruncates[i]) { 1994 Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret); 1995 } 1996 1997 InVals.push_back(Ret); 1998 } 1999 2000 // set isTailCall to false for now, until we figure out how to express 2001 // tail call optimization in PTX 2002 isTailCall = false; 2003 return Chain; 2004 } 2005 2006 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() 2007 // (see LegalizeDAG.cpp). This is slow and uses local memory. 2008 // We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 2009 SDValue 2010 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 2011 SDNode *Node = Op.getNode(); 2012 SDLoc dl(Node); 2013 SmallVector<SDValue, 8> Ops; 2014 unsigned NumOperands = Node->getNumOperands(); 2015 for (unsigned i = 0; i < NumOperands; ++i) { 2016 SDValue SubOp = Node->getOperand(i); 2017 EVT VVT = SubOp.getNode()->getValueType(0); 2018 EVT EltVT = VVT.getVectorElementType(); 2019 unsigned NumSubElem = VVT.getVectorNumElements(); 2020 for (unsigned j = 0; j < NumSubElem; ++j) { 2021 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp, 2022 DAG.getIntPtrConstant(j, dl))); 2023 } 2024 } 2025 return DAG.getBuildVector(Node->getValueType(0), dl, Ops); 2026 } 2027 2028 // We can init constant f16x2 with a single .b32 move. Normally it 2029 // would get lowered as two constant loads and vector-packing move. 
2030 // mov.b16 %h1, 0x4000; 2031 // mov.b16 %h2, 0x3C00; 2032 // mov.b32 %hh2, {%h2, %h1}; 2033 // Instead we want just a constant move: 2034 // mov.b32 %hh2, 0x40003C00 2035 // 2036 // This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0 2037 // generates good SASS in both cases. 2038 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, 2039 SelectionDAG &DAG) const { 2040 if (!(Op->getValueType(0) == MVT::v2f16 && 2041 isa<ConstantFPSDNode>(Op->getOperand(0)) && 2042 isa<ConstantFPSDNode>(Op->getOperand(1)))) 2043 return Op; 2044 2045 APInt E0 = 2046 cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt(); 2047 APInt E1 = 2048 cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt(); 2049 SDValue Const = 2050 DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32); 2051 return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const); 2052 } 2053 2054 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 2055 SelectionDAG &DAG) const { 2056 SDValue Index = Op->getOperand(1); 2057 // Constant index will be matched by tablegen. 2058 if (isa<ConstantSDNode>(Index.getNode())) 2059 return Op; 2060 2061 // Extract individual elements and select one of them. 2062 SDValue Vector = Op->getOperand(0); 2063 EVT VectorVT = Vector.getValueType(); 2064 assert(VectorVT == MVT::v2f16 && "Unexpected vector type."); 2065 EVT EltVT = VectorVT.getVectorElementType(); 2066 2067 SDLoc dl(Op.getNode()); 2068 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 2069 DAG.getIntPtrConstant(0, dl)); 2070 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 2071 DAG.getIntPtrConstant(1, dl)); 2072 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1, 2073 ISD::CondCode::SETEQ); 2074 } 2075 2076 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which 2077 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2078 /// amount, or 2079 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2080 /// amount. 2081 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, 2082 SelectionDAG &DAG) const { 2083 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 2084 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 2085 2086 EVT VT = Op.getValueType(); 2087 unsigned VTBits = VT.getSizeInBits(); 2088 SDLoc dl(Op); 2089 SDValue ShOpLo = Op.getOperand(0); 2090 SDValue ShOpHi = Op.getOperand(1); 2091 SDValue ShAmt = Op.getOperand(2); 2092 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 2093 2094 if (VTBits == 32 && STI.getSmVersion() >= 35) { 2095 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
2096 // {dHi, dLo} = {aHi, aLo} >> Amt 2097 // dHi = aHi >> Amt 2098 // dLo = shf.r.clamp aLo, aHi, Amt 2099 2100 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 2101 SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi, 2102 ShAmt); 2103 2104 SDValue Ops[2] = { Lo, Hi }; 2105 return DAG.getMergeValues(Ops, dl); 2106 } 2107 else { 2108 // {dHi, dLo} = {aHi, aLo} >> Amt 2109 // - if (Amt>=size) then 2110 // dLo = aHi >> (Amt-size) 2111 // dHi = aHi >> Amt (this is either all 0 or all 1) 2112 // else 2113 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) 2114 // dHi = aHi >> Amt 2115 2116 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 2117 DAG.getConstant(VTBits, dl, MVT::i32), 2118 ShAmt); 2119 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 2120 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 2121 DAG.getConstant(VTBits, dl, MVT::i32)); 2122 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 2123 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 2124 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 2125 2126 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, 2127 DAG.getConstant(VTBits, dl, MVT::i32), 2128 ISD::SETGE); 2129 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 2130 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); 2131 2132 SDValue Ops[2] = { Lo, Hi }; 2133 return DAG.getMergeValues(Ops, dl); 2134 } 2135 } 2136 2137 /// LowerShiftLeftParts - Lower SHL_PARTS, which 2138 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2139 /// amount, or 2140 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2141 /// amount. 2142 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, 2143 SelectionDAG &DAG) const { 2144 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 2145 assert(Op.getOpcode() == ISD::SHL_PARTS); 2146 2147 EVT VT = Op.getValueType(); 2148 unsigned VTBits = VT.getSizeInBits(); 2149 SDLoc dl(Op); 2150 SDValue ShOpLo = Op.getOperand(0); 2151 SDValue ShOpHi = Op.getOperand(1); 2152 SDValue ShAmt = Op.getOperand(2); 2153 2154 if (VTBits == 32 && STI.getSmVersion() >= 35) { 2155 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
2156 // {dHi, dLo} = {aHi, aLo} << Amt 2157 // dHi = shf.l.clamp aLo, aHi, Amt 2158 // dLo = aLo << Amt 2159 2160 SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi, 2161 ShAmt); 2162 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 2163 2164 SDValue Ops[2] = { Lo, Hi }; 2165 return DAG.getMergeValues(Ops, dl); 2166 } 2167 else { 2168 // {dHi, dLo} = {aHi, aLo} << Amt 2169 // - if (Amt>=size) then 2170 // dLo = aLo << Amt (all 0) 2171 // dHi = aLo << (Amt-size) 2172 // else 2173 // dLo = aLo << Amt 2174 // dHi = (aHi << Amt) | (aLo >> (size-Amt)) 2175 2176 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 2177 DAG.getConstant(VTBits, dl, MVT::i32), 2178 ShAmt); 2179 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 2180 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 2181 DAG.getConstant(VTBits, dl, MVT::i32)); 2182 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 2183 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 2184 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 2185 2186 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, 2187 DAG.getConstant(VTBits, dl, MVT::i32), 2188 ISD::SETGE); 2189 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 2190 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); 2191 2192 SDValue Ops[2] = { Lo, Hi }; 2193 return DAG.getMergeValues(Ops, dl); 2194 } 2195 } 2196 2197 SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { 2198 EVT VT = Op.getValueType(); 2199 2200 if (VT == MVT::f32) 2201 return LowerFROUND32(Op, DAG); 2202 2203 if (VT == MVT::f64) 2204 return LowerFROUND64(Op, DAG); 2205 2206 llvm_unreachable("unhandled type"); 2207 } 2208 2209 // This is the rounding method used in CUDA libdevice in C-like code: 2210 // float roundf(float A) 2211 // { 2212 // float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f)); 2213 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA; 2214 // return abs(A) < 0.5 ? (float)(int)A : RoundedA; 2215 // } 2216 SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op, 2217 SelectionDAG &DAG) const { 2218 SDLoc SL(Op); 2219 SDValue A = Op.getOperand(0); 2220 EVT VT = Op.getValueType(); 2221 2222 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A); 2223 2224 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f)) 2225 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A); 2226 const int SignBitMask = 0x80000000; 2227 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast, 2228 DAG.getConstant(SignBitMask, SL, MVT::i32)); 2229 const int PointFiveInBits = 0x3F000000; 2230 SDValue PointFiveWithSignRaw = 2231 DAG.getNode(ISD::OR, SL, MVT::i32, Sign, 2232 DAG.getConstant(PointFiveInBits, SL, MVT::i32)); 2233 SDValue PointFiveWithSign = 2234 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw); 2235 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign); 2236 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA); 2237 2238 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA; 2239 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2240 SDValue IsLarge = 2241 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT), 2242 ISD::SETOGT); 2243 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); 2244 2245 // return abs(A) < 0.5 ?
(float)(int)A : RoundedA; 2246 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA, 2247 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2248 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A); 2249 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA); 2250 } 2251 2252 // The implementation of round(double) is similar to that of round(float) in 2253 // that they both separate the value range into three regions and use a method 2254 // specific to the region to round the values. However, round(double) first 2255 // calculates the round of the absolute value and then adds the sign back while 2256 // round(float) directly rounds the value with sign. 2257 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op, 2258 SelectionDAG &DAG) const { 2259 SDLoc SL(Op); 2260 SDValue A = Op.getOperand(0); 2261 EVT VT = Op.getValueType(); 2262 2263 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A); 2264 2265 // double RoundedA = (double) (int) (abs(A) + 0.5f); 2266 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA, 2267 DAG.getConstantFP(0.5, SL, VT)); 2268 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA); 2269 2270 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA; 2271 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2272 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA, 2273 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2274 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall, 2275 DAG.getConstantFP(0, SL, VT), 2276 RoundedA); 2277 2278 // Add sign to rounded_A 2279 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A); 2280 2281 2282 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA; 2283 SDValue IsLarge = 2284 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT), 2285 ISD::SETOGT); 2286 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); 2287 } 2288 2289 2290 2291 SDValue 2292 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 2293 switch (Op.getOpcode()) { 2294 case ISD::RETURNADDR: 2295 return SDValue(); 2296 case ISD::FRAMEADDR: 2297 return SDValue(); 2298 case ISD::GlobalAddress: 2299 return LowerGlobalAddress(Op, DAG); 2300 case ISD::INTRINSIC_W_CHAIN: 2301 return Op; 2302 case ISD::BUILD_VECTOR: 2303 return LowerBUILD_VECTOR(Op, DAG); 2304 case ISD::EXTRACT_SUBVECTOR: 2305 return Op; 2306 case ISD::EXTRACT_VECTOR_ELT: 2307 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 2308 case ISD::CONCAT_VECTORS: 2309 return LowerCONCAT_VECTORS(Op, DAG); 2310 case ISD::STORE: 2311 return LowerSTORE(Op, DAG); 2312 case ISD::LOAD: 2313 return LowerLOAD(Op, DAG); 2314 case ISD::SHL_PARTS: 2315 return LowerShiftLeftParts(Op, DAG); 2316 case ISD::SRA_PARTS: 2317 case ISD::SRL_PARTS: 2318 return LowerShiftRightParts(Op, DAG); 2319 case ISD::SELECT: 2320 return LowerSelect(Op, DAG); 2321 case ISD::FROUND: 2322 return LowerFROUND(Op, DAG); 2323 case ISD::VAARG: 2324 return LowerVAARG(Op, DAG); 2325 case ISD::VASTART: 2326 return LowerVASTART(Op, DAG); 2327 default: 2328 llvm_unreachable("Custom lowering not defined for operation"); 2329 } 2330 } 2331 2332 // This function is almost a copy of SelectionDAG::expandVAArg(). 2333 // The only difference is that this one produces loads from the local address space.
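// Illustrative sketch of the expansion performed below (not from the original
// source), for a va_arg of type T whose alignment A exceeds the minimum stack
// argument alignment:
//   cur  = load ap              ; current vararg pointer
//   cur  = (cur + A - 1) & -A   ; align the pointer up
//   next = cur + sizeof(T)
//   store next, ap              ; advance past this argument
//   val  = load T, cur          ; final load uses the local address space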
2334 SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 2335 const TargetLowering *TLI = STI.getTargetLowering(); 2336 SDLoc DL(Op); 2337 2338 SDNode *Node = Op.getNode(); 2339 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue(); 2340 EVT VT = Node->getValueType(0); 2341 auto *Ty = VT.getTypeForEVT(*DAG.getContext()); 2342 SDValue Tmp1 = Node->getOperand(0); 2343 SDValue Tmp2 = Node->getOperand(1); 2344 const MaybeAlign MA(Node->getConstantOperandVal(3)); 2345 2346 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL, 2347 Tmp1, Tmp2, MachinePointerInfo(V)); 2348 SDValue VAList = VAListLoad; 2349 2350 if (MA && *MA > TLI->getMinStackArgumentAlignment()) { 2351 VAList = DAG.getNode( 2352 ISD::ADD, DL, VAList.getValueType(), VAList, 2353 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType())); 2354 2355 VAList = DAG.getNode( 2356 ISD::AND, DL, VAList.getValueType(), VAList, 2357 DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType())); 2358 } 2359 2360 // Increment the pointer, VAList, to the next vaarg 2361 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList, 2362 DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty), 2363 DL, VAList.getValueType())); 2364 2365 // Store the incremented VAList to the legalized pointer 2366 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2, 2367 MachinePointerInfo(V)); 2368 2369 const Value *SrcV = 2370 Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL)); 2371 2372 // Load the actual argument out of the pointer VAList 2373 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV)); 2374 } 2375 2376 SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 2377 const TargetLowering *TLI = STI.getTargetLowering(); 2378 SDLoc DL(Op); 2379 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout()); 2380 2381 // Store the address of unsized array <function>_vararg[] in the ap object. 2382 SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT); 2383 SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg); 2384 2385 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 2386 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1), 2387 MachinePointerInfo(SV)); 2388 } 2389 2390 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const { 2391 SDValue Op0 = Op->getOperand(0); 2392 SDValue Op1 = Op->getOperand(1); 2393 SDValue Op2 = Op->getOperand(2); 2394 SDLoc DL(Op.getNode()); 2395 2396 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1"); 2397 2398 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); 2399 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); 2400 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2); 2401 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select); 2402 2403 return Trunc; 2404 } 2405 2406 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 2407 if (Op.getValueType() == MVT::i1) 2408 return LowerLOADi1(Op, DAG); 2409 2410 // v2f16 is legal, so we can't rely on legalizer to handle unaligned 2411 // loads and have to handle it here. 
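// For example (illustrative): a v2f16 load that is only 2-byte aligned gets
// expanded by expandUnalignedLoad() into two 16-bit loads plus a rebuild of
// the vector, instead of a single 32-bit access.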
2412 if (Op.getValueType() == MVT::v2f16) { 2413 LoadSDNode *Load = cast<LoadSDNode>(Op); 2414 EVT MemVT = Load->getMemoryVT(); 2415 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2416 MemVT, *Load->getMemOperand())) { 2417 SDValue Ops[2]; 2418 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); 2419 return DAG.getMergeValues(Ops, SDLoc(Op)); 2420 } 2421 } 2422 2423 return SDValue(); 2424 } 2425 2426 // v = ld i1* addr 2427 // => 2428 // v1 = ld i8* addr (-> i16) 2429 // v = trunc i16 to i1 2430 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { 2431 SDNode *Node = Op.getNode(); 2432 LoadSDNode *LD = cast<LoadSDNode>(Node); 2433 SDLoc dl(Node); 2434 assert(LD->getExtensionType() == ISD::NON_EXTLOAD); 2435 assert(Node->getValueType(0) == MVT::i1 && 2436 "Custom lowering for i1 load only"); 2437 SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), 2438 LD->getPointerInfo(), LD->getAlign(), 2439 LD->getMemOperand()->getFlags()); 2440 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); 2441 // The legalizer (the caller) is expecting two values from the legalized 2442 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() 2443 // in LegalizeDAG.cpp which also uses MergeValues. 2444 SDValue Ops[] = { result, LD->getChain() }; 2445 return DAG.getMergeValues(Ops, dl); 2446 } 2447 2448 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 2449 StoreSDNode *Store = cast<StoreSDNode>(Op); 2450 EVT VT = Store->getMemoryVT(); 2451 2452 if (VT == MVT::i1) 2453 return LowerSTOREi1(Op, DAG); 2454 2455 // v2f16 is legal, so we can't rely on legalizer to handle unaligned 2456 // stores and have to handle it here. 2457 if (VT == MVT::v2f16 && 2458 !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2459 VT, *Store->getMemOperand())) 2460 return expandUnalignedStore(Store, DAG); 2461 2462 if (VT.isVector()) 2463 return LowerSTOREVector(Op, DAG); 2464 2465 return SDValue(); 2466 } 2467 2468 SDValue 2469 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { 2470 SDNode *N = Op.getNode(); 2471 SDValue Val = N->getOperand(1); 2472 SDLoc DL(N); 2473 EVT ValVT = Val.getValueType(); 2474 2475 if (ValVT.isVector()) { 2476 // We only handle "native" vector sizes for now, e.g. <4 x double> is not 2477 // legal. We can (and should) split that into 2 stores of <2 x double> here 2478 // but I'm leaving that as a TODO for now. 2479 if (!ValVT.isSimple()) 2480 return SDValue(); 2481 switch (ValVT.getSimpleVT().SimpleTy) { 2482 default: 2483 return SDValue(); 2484 case MVT::v2i8: 2485 case MVT::v2i16: 2486 case MVT::v2i32: 2487 case MVT::v2i64: 2488 case MVT::v2f16: 2489 case MVT::v2bf16: 2490 case MVT::v2f32: 2491 case MVT::v2f64: 2492 case MVT::v4i8: 2493 case MVT::v4i16: 2494 case MVT::v4i32: 2495 case MVT::v4f16: 2496 case MVT::v4bf16: 2497 case MVT::v4f32: 2498 case MVT::v8f16: // <4 x f16x2> 2499 case MVT::v8bf16: // <4 x bf16x2> 2500 // This is a "native" vector type 2501 break; 2502 } 2503 2504 MemSDNode *MemSD = cast<MemSDNode>(N); 2505 const DataLayout &TD = DAG.getDataLayout(); 2506 2507 Align Alignment = MemSD->getAlign(); 2508 Align PrefAlign = 2509 TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext())); 2510 if (Alignment < PrefAlign) { 2511 // This store is not sufficiently aligned, so bail out and let this vector 2512 // store be scalarized. Note that we may still be able to emit smaller 2513 // vector stores. 
For example, if we are storing a <4 x float> with an 2514 // alignment of 8, this check will fail but the legalizer will try again 2515 // with 2 x <2 x float>, which will succeed with an alignment of 8. 2516 return SDValue(); 2517 } 2518 2519 unsigned Opcode = 0; 2520 EVT EltVT = ValVT.getVectorElementType(); 2521 unsigned NumElts = ValVT.getVectorNumElements(); 2522 2523 // Since StoreV2 is a target node, we cannot rely on DAG type legalization. 2524 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 2525 // stored type to i16 and propagate the "real" type as the memory type. 2526 bool NeedExt = false; 2527 if (EltVT.getSizeInBits() < 16) 2528 NeedExt = true; 2529 2530 bool StoreF16x2 = false; 2531 switch (NumElts) { 2532 default: 2533 return SDValue(); 2534 case 2: 2535 Opcode = NVPTXISD::StoreV2; 2536 break; 2537 case 4: 2538 Opcode = NVPTXISD::StoreV4; 2539 break; 2540 case 8: 2541 // v8f16 is a special case. PTX doesn't have st.v8.f16 2542 // instruction. Instead, we split the vector into v2f16 chunks and 2543 // store them with st.v4.b32. 2544 assert((EltVT == MVT::f16 || EltVT == MVT::bf16) && 2545 "Wrong type for the vector."); 2546 Opcode = NVPTXISD::StoreV4; 2547 StoreF16x2 = true; 2548 break; 2549 } 2550 2551 SmallVector<SDValue, 8> Ops; 2552 2553 // First is the chain 2554 Ops.push_back(N->getOperand(0)); 2555 2556 if (StoreF16x2) { 2557 // Combine f16,f16 -> v2f16 2558 NumElts /= 2; 2559 for (unsigned i = 0; i < NumElts; ++i) { 2560 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, 2561 DAG.getIntPtrConstant(i * 2, DL)); 2562 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, 2563 DAG.getIntPtrConstant(i * 2 + 1, DL)); 2564 SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1); 2565 Ops.push_back(V2); 2566 } 2567 } else { 2568 // Then the split values 2569 for (unsigned i = 0; i < NumElts; ++i) { 2570 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 2571 DAG.getIntPtrConstant(i, DL)); 2572 if (NeedExt) 2573 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); 2574 Ops.push_back(ExtVal); 2575 } 2576 } 2577 2578 // Then any remaining arguments 2579 Ops.append(N->op_begin() + 2, N->op_end()); 2580 2581 SDValue NewSt = 2582 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops, 2583 MemSD->getMemoryVT(), MemSD->getMemOperand()); 2584 2585 // return DCI.CombineTo(N, NewSt, true); 2586 return NewSt; 2587 } 2588 2589 return SDValue(); 2590 } 2591 2592 // st i1 v, addr 2593 // => 2594 // v1 = zxt v to i16 2595 // st.u8 i16, addr 2596 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { 2597 SDNode *Node = Op.getNode(); 2598 SDLoc dl(Node); 2599 StoreSDNode *ST = cast<StoreSDNode>(Node); 2600 SDValue Tmp1 = ST->getChain(); 2601 SDValue Tmp2 = ST->getBasePtr(); 2602 SDValue Tmp3 = ST->getValue(); 2603 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only"); 2604 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); 2605 SDValue Result = 2606 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, 2607 ST->getAlign(), ST->getMemOperand()->getFlags()); 2608 return Result; 2609 } 2610 2611 // This creates target external symbol for a function parameter. 2612 // Name of the symbol is composed from its index and the function name. 2613 // Negative index corresponds to special parameter (unsized array) used for 2614 // passing variable arguments. 
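// For example (illustrative): parameter 1 of a function named "foo" becomes
// the symbol "foo_param_1", and the unsized vararg array becomes "foo_vararg".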
2615 SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, 2616 EVT v) const { 2617 std::string ParamSym; 2618 raw_string_ostream ParamStr(ParamSym); 2619 2620 ParamStr << DAG.getMachineFunction().getName(); 2621 2622 if (idx < 0) 2623 ParamStr << "_vararg"; 2624 else 2625 ParamStr << "_param_" << idx; 2626 2627 StringRef SavedStr = 2628 nvTM->getStrPool().save(ParamSym); 2629 return DAG.getTargetExternalSymbol(SavedStr.data(), v); 2630 } 2631 2632 SDValue NVPTXTargetLowering::LowerFormalArguments( 2633 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 2634 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 2635 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 2636 MachineFunction &MF = DAG.getMachineFunction(); 2637 const DataLayout &DL = DAG.getDataLayout(); 2638 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2639 2640 const Function *F = &MF.getFunction(); 2641 const AttributeList &PAL = F->getAttributes(); 2642 const TargetLowering *TLI = STI.getTargetLowering(); 2643 2644 SDValue Root = DAG.getRoot(); 2645 std::vector<SDValue> OutChains; 2646 2647 bool isABI = (STI.getSmVersion() >= 20); 2648 assert(isABI && "Non-ABI compilation is not supported"); 2649 if (!isABI) 2650 return Chain; 2651 2652 std::vector<Type *> argTypes; 2653 std::vector<const Argument *> theArgs; 2654 for (const Argument &I : F->args()) { 2655 theArgs.push_back(&I); 2656 argTypes.push_back(I.getType()); 2657 } 2658 // argTypes.size() (or theArgs.size()) and Ins.size() need not match. 2659 // Ins.size() will be larger 2660 // * if there is an aggregate argument with multiple fields (each field 2661 // showing up separately in Ins) 2662 // * if there is a vector argument with more than typical vector-length 2663 // elements (generally if more than 4) where each vector element is 2664 // individually present in Ins. 2665 // So a different index should be used for indexing into Ins. 2666 // See similar issue in LowerCall. 2667 unsigned InsIdx = 0; 2668 2669 int idx = 0; 2670 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) { 2671 Type *Ty = argTypes[i]; 2672 2673 if (theArgs[i]->use_empty()) { 2674 // argument is dead 2675 if (Ty->isAggregateType() || Ty->isIntegerTy(128)) { 2676 SmallVector<EVT, 16> vtparts; 2677 2678 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts); 2679 assert(vtparts.size() > 0 && "empty aggregate type not expected"); 2680 for (unsigned parti = 0, parte = vtparts.size(); parti != parte; 2681 ++parti) { 2682 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 2683 ++InsIdx; 2684 } 2685 if (vtparts.size() > 0) 2686 --InsIdx; 2687 continue; 2688 } 2689 if (Ty->isVectorTy()) { 2690 EVT ObjectVT = getValueType(DL, Ty); 2691 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT); 2692 for (unsigned parti = 0; parti < NumRegs; ++parti) { 2693 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 2694 ++InsIdx; 2695 } 2696 if (NumRegs > 0) 2697 --InsIdx; 2698 continue; 2699 } 2700 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 2701 continue; 2702 } 2703 2704 // In the following cases, assign a node order of "idx+1" 2705 // to newly created nodes. The SDNodes for params have to 2706 // appear in the same order as their order of appearance 2707 // in the original function. "idx+1" holds that order. 
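// Illustrative PTX for a simple i32 parameter of a hypothetical function
// "foo" (assumption; the exact form depends on the argument and on how parts
// get vectorized):
//   ld.param.u32 %r1, [foo_param_0];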
2708 if (!PAL.hasParamAttr(i, Attribute::ByVal)) { 2709 bool aggregateIsPacked = false; 2710 if (StructType *STy = dyn_cast<StructType>(Ty)) 2711 aggregateIsPacked = STy->isPacked(); 2712 2713 SmallVector<EVT, 16> VTs; 2714 SmallVector<uint64_t, 16> Offsets; 2715 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); 2716 assert(VTs.size() > 0 && "Unexpected empty type."); 2717 auto VectorInfo = 2718 VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty)); 2719 2720 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 2721 int VecIdx = -1; // Index of the first element of the current vector. 2722 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { 2723 if (VectorInfo[parti] & PVF_FIRST) { 2724 assert(VecIdx == -1 && "Orphaned vector."); 2725 VecIdx = parti; 2726 } 2727 2728 // That's the last element of this store op. 2729 if (VectorInfo[parti] & PVF_LAST) { 2730 unsigned NumElts = parti - VecIdx + 1; 2731 EVT EltVT = VTs[parti]; 2732 // i1 is loaded/stored as i8. 2733 EVT LoadVT = EltVT; 2734 if (EltVT == MVT::i1) 2735 LoadVT = MVT::i8; 2736 else if (EltVT == MVT::v2f16) 2737 // getLoad needs a vector type, but it can't handle 2738 // vectors which contain v2f16 elements. So we must load 2739 // using i32 here and then bitcast back. 2740 LoadVT = MVT::i32; 2741 2742 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); 2743 SDValue VecAddr = 2744 DAG.getNode(ISD::ADD, dl, PtrVT, Arg, 2745 DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); 2746 Value *srcValue = Constant::getNullValue(PointerType::get( 2747 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); 2748 SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr, 2749 MachinePointerInfo(srcValue), 2750 MaybeAlign(aggregateIsPacked ? 1 : 0), 2751 MachineMemOperand::MODereferenceable | 2752 MachineMemOperand::MOInvariant); 2753 if (P.getNode()) 2754 P.getNode()->setIROrder(idx + 1); 2755 for (unsigned j = 0; j < NumElts; ++j) { 2756 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P, 2757 DAG.getIntPtrConstant(j, dl)); 2758 // We've loaded i1 as an i8 and now must truncate it back to i1 2759 if (EltVT == MVT::i1) 2760 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); 2761 // v2f16 was loaded as an i32. Now we must bitcast it back. 2762 else if (EltVT == MVT::v2f16) 2763 Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt); 2764 2765 // If a promoted integer type is used, truncate down to the original 2766 MVT PromotedVT; 2767 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) { 2768 Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); 2769 } 2770 2771 // Extend the element if necessary (e.g. an i8 is loaded 2772 // into an i16 register) 2773 if (Ins[InsIdx].VT.isInteger() && 2774 Ins[InsIdx].VT.getFixedSizeInBits() > 2775 LoadVT.getFixedSizeInBits()) { 2776 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 2777 : ISD::ZERO_EXTEND; 2778 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt); 2779 } 2780 InVals.push_back(Elt); 2781 } 2782 2783 // Reset vector tracking state. 2784 VecIdx = -1; 2785 } 2786 ++InsIdx; 2787 } 2788 if (VTs.size() > 0) 2789 --InsIdx; 2790 continue; 2791 } 2792 2793 // Param has ByVal attribute 2794 // Return MoveParam(param symbol). 2795 // Ideally, the param symbol can be returned directly, 2796 // but when SDNode builder decides to use it in a CopyToReg(), 2797 // machine instruction fails because TargetExternalSymbol 2798 // (not lowered) is target dependent, and CopyToReg assumes 2799 // the source is lowered. 
2800 EVT ObjectVT = getValueType(DL, Ty); 2801 assert(ObjectVT == Ins[InsIdx].VT && 2802 "Ins type did not match function type"); 2803 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 2804 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); 2805 if (p.getNode()) 2806 p.getNode()->setIROrder(idx + 1); 2807 InVals.push_back(p); 2808 } 2809 2810 if (!OutChains.empty()) 2811 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains)); 2812 2813 return Chain; 2814 } 2815 2816 SDValue 2817 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 2818 bool isVarArg, 2819 const SmallVectorImpl<ISD::OutputArg> &Outs, 2820 const SmallVectorImpl<SDValue> &OutVals, 2821 const SDLoc &dl, SelectionDAG &DAG) const { 2822 const MachineFunction &MF = DAG.getMachineFunction(); 2823 const Function &F = MF.getFunction(); 2824 Type *RetTy = MF.getFunction().getReturnType(); 2825 2826 bool isABI = (STI.getSmVersion() >= 20); 2827 assert(isABI && "Non-ABI compilation is not supported"); 2828 if (!isABI) 2829 return Chain; 2830 2831 const DataLayout &DL = DAG.getDataLayout(); 2832 SmallVector<SDValue, 16> PromotedOutVals; 2833 SmallVector<EVT, 16> VTs; 2834 SmallVector<uint64_t, 16> Offsets; 2835 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); 2836 assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); 2837 2838 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 2839 SDValue PromotedOutVal = OutVals[i]; 2840 MVT PromotedVT; 2841 if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) { 2842 VTs[i] = EVT(PromotedVT); 2843 } 2844 if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) { 2845 llvm::ISD::NodeType Ext = 2846 Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 2847 PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal); 2848 } 2849 PromotedOutVals.push_back(PromotedOutVal); 2850 } 2851 2852 auto VectorInfo = VectorizePTXValueVTs( 2853 VTs, Offsets, 2854 RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL) 2855 : Align(1)); 2856 2857 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 2858 // 32-bits are sign extended or zero extended, depending on whether 2859 // they are signed or unsigned types. 2860 bool ExtendIntegerRetVal = 2861 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 2862 2863 SmallVector<SDValue, 6> StoreOperands; 2864 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 2865 // New load/store. Record chain and offset operands. 2866 if (VectorInfo[i] & PVF_FIRST) { 2867 assert(StoreOperands.empty() && "Orphaned operand list."); 2868 StoreOperands.push_back(Chain); 2869 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); 2870 } 2871 2872 SDValue OutVal = OutVals[i]; 2873 SDValue RetVal = PromotedOutVals[i]; 2874 2875 if (ExtendIntegerRetVal) { 2876 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND 2877 : ISD::ZERO_EXTEND, 2878 dl, MVT::i32, RetVal); 2879 } else if (OutVal.getValueSizeInBits() < 16) { 2880 // Use 16-bit registers for small load-stores as it's the 2881 // smallest general purpose register size supported by NVPTX. 2882 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); 2883 } 2884 2885 // Record the value to return. 2886 StoreOperands.push_back(RetVal); 2887 2888 // That's the last element of this store op. 
2889 if (VectorInfo[i] & PVF_LAST) { 2890 NVPTXISD::NodeType Op; 2891 unsigned NumElts = StoreOperands.size() - 2; 2892 switch (NumElts) { 2893 case 1: 2894 Op = NVPTXISD::StoreRetval; 2895 break; 2896 case 2: 2897 Op = NVPTXISD::StoreRetvalV2; 2898 break; 2899 case 4: 2900 Op = NVPTXISD::StoreRetvalV4; 2901 break; 2902 default: 2903 llvm_unreachable("Invalid vector info."); 2904 } 2905 2906 // Adjust type of load/store op if we've extended the scalar 2907 // return value. 2908 EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; 2909 Chain = DAG.getMemIntrinsicNode( 2910 Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType, 2911 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); 2912 // Cleanup vector state. 2913 StoreOperands.clear(); 2914 } 2915 } 2916 2917 return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain); 2918 } 2919 2920 void NVPTXTargetLowering::LowerAsmOperandForConstraint( 2921 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, 2922 SelectionDAG &DAG) const { 2923 if (Constraint.length() > 1) 2924 return; 2925 else 2926 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 2927 } 2928 2929 static unsigned getOpcForTextureInstr(unsigned Intrinsic) { 2930 switch (Intrinsic) { 2931 default: 2932 return 0; 2933 2934 case Intrinsic::nvvm_tex_1d_v4f32_s32: 2935 return NVPTXISD::Tex1DFloatS32; 2936 case Intrinsic::nvvm_tex_1d_v4f32_f32: 2937 return NVPTXISD::Tex1DFloatFloat; 2938 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 2939 return NVPTXISD::Tex1DFloatFloatLevel; 2940 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 2941 return NVPTXISD::Tex1DFloatFloatGrad; 2942 case Intrinsic::nvvm_tex_1d_v4s32_s32: 2943 return NVPTXISD::Tex1DS32S32; 2944 case Intrinsic::nvvm_tex_1d_v4s32_f32: 2945 return NVPTXISD::Tex1DS32Float; 2946 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 2947 return NVPTXISD::Tex1DS32FloatLevel; 2948 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 2949 return NVPTXISD::Tex1DS32FloatGrad; 2950 case Intrinsic::nvvm_tex_1d_v4u32_s32: 2951 return NVPTXISD::Tex1DU32S32; 2952 case Intrinsic::nvvm_tex_1d_v4u32_f32: 2953 return NVPTXISD::Tex1DU32Float; 2954 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 2955 return NVPTXISD::Tex1DU32FloatLevel; 2956 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 2957 return NVPTXISD::Tex1DU32FloatGrad; 2958 2959 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 2960 return NVPTXISD::Tex1DArrayFloatS32; 2961 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 2962 return NVPTXISD::Tex1DArrayFloatFloat; 2963 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 2964 return NVPTXISD::Tex1DArrayFloatFloatLevel; 2965 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 2966 return NVPTXISD::Tex1DArrayFloatFloatGrad; 2967 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 2968 return NVPTXISD::Tex1DArrayS32S32; 2969 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 2970 return NVPTXISD::Tex1DArrayS32Float; 2971 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 2972 return NVPTXISD::Tex1DArrayS32FloatLevel; 2973 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 2974 return NVPTXISD::Tex1DArrayS32FloatGrad; 2975 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 2976 return NVPTXISD::Tex1DArrayU32S32; 2977 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 2978 return NVPTXISD::Tex1DArrayU32Float; 2979 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 2980 return NVPTXISD::Tex1DArrayU32FloatLevel; 2981 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 2982 return NVPTXISD::Tex1DArrayU32FloatGrad; 2983 2984 case 
Intrinsic::nvvm_tex_2d_v4f32_s32: 2985 return NVPTXISD::Tex2DFloatS32; 2986 case Intrinsic::nvvm_tex_2d_v4f32_f32: 2987 return NVPTXISD::Tex2DFloatFloat; 2988 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 2989 return NVPTXISD::Tex2DFloatFloatLevel; 2990 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 2991 return NVPTXISD::Tex2DFloatFloatGrad; 2992 case Intrinsic::nvvm_tex_2d_v4s32_s32: 2993 return NVPTXISD::Tex2DS32S32; 2994 case Intrinsic::nvvm_tex_2d_v4s32_f32: 2995 return NVPTXISD::Tex2DS32Float; 2996 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 2997 return NVPTXISD::Tex2DS32FloatLevel; 2998 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 2999 return NVPTXISD::Tex2DS32FloatGrad; 3000 case Intrinsic::nvvm_tex_2d_v4u32_s32: 3001 return NVPTXISD::Tex2DU32S32; 3002 case Intrinsic::nvvm_tex_2d_v4u32_f32: 3003 return NVPTXISD::Tex2DU32Float; 3004 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 3005 return NVPTXISD::Tex2DU32FloatLevel; 3006 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 3007 return NVPTXISD::Tex2DU32FloatGrad; 3008 3009 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 3010 return NVPTXISD::Tex2DArrayFloatS32; 3011 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 3012 return NVPTXISD::Tex2DArrayFloatFloat; 3013 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 3014 return NVPTXISD::Tex2DArrayFloatFloatLevel; 3015 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 3016 return NVPTXISD::Tex2DArrayFloatFloatGrad; 3017 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 3018 return NVPTXISD::Tex2DArrayS32S32; 3019 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 3020 return NVPTXISD::Tex2DArrayS32Float; 3021 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 3022 return NVPTXISD::Tex2DArrayS32FloatLevel; 3023 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 3024 return NVPTXISD::Tex2DArrayS32FloatGrad; 3025 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 3026 return NVPTXISD::Tex2DArrayU32S32; 3027 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 3028 return NVPTXISD::Tex2DArrayU32Float; 3029 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 3030 return NVPTXISD::Tex2DArrayU32FloatLevel; 3031 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 3032 return NVPTXISD::Tex2DArrayU32FloatGrad; 3033 3034 case Intrinsic::nvvm_tex_3d_v4f32_s32: 3035 return NVPTXISD::Tex3DFloatS32; 3036 case Intrinsic::nvvm_tex_3d_v4f32_f32: 3037 return NVPTXISD::Tex3DFloatFloat; 3038 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 3039 return NVPTXISD::Tex3DFloatFloatLevel; 3040 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 3041 return NVPTXISD::Tex3DFloatFloatGrad; 3042 case Intrinsic::nvvm_tex_3d_v4s32_s32: 3043 return NVPTXISD::Tex3DS32S32; 3044 case Intrinsic::nvvm_tex_3d_v4s32_f32: 3045 return NVPTXISD::Tex3DS32Float; 3046 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 3047 return NVPTXISD::Tex3DS32FloatLevel; 3048 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 3049 return NVPTXISD::Tex3DS32FloatGrad; 3050 case Intrinsic::nvvm_tex_3d_v4u32_s32: 3051 return NVPTXISD::Tex3DU32S32; 3052 case Intrinsic::nvvm_tex_3d_v4u32_f32: 3053 return NVPTXISD::Tex3DU32Float; 3054 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 3055 return NVPTXISD::Tex3DU32FloatLevel; 3056 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 3057 return NVPTXISD::Tex3DU32FloatGrad; 3058 3059 case Intrinsic::nvvm_tex_cube_v4f32_f32: 3060 return NVPTXISD::TexCubeFloatFloat; 3061 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 3062 return NVPTXISD::TexCubeFloatFloatLevel; 3063 case Intrinsic::nvvm_tex_cube_v4s32_f32: 3064 return NVPTXISD::TexCubeS32Float; 3065 case 
Intrinsic::nvvm_tex_cube_level_v4s32_f32: 3066 return NVPTXISD::TexCubeS32FloatLevel; 3067 case Intrinsic::nvvm_tex_cube_v4u32_f32: 3068 return NVPTXISD::TexCubeU32Float; 3069 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 3070 return NVPTXISD::TexCubeU32FloatLevel; 3071 3072 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 3073 return NVPTXISD::TexCubeArrayFloatFloat; 3074 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 3075 return NVPTXISD::TexCubeArrayFloatFloatLevel; 3076 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 3077 return NVPTXISD::TexCubeArrayS32Float; 3078 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 3079 return NVPTXISD::TexCubeArrayS32FloatLevel; 3080 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 3081 return NVPTXISD::TexCubeArrayU32Float; 3082 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 3083 return NVPTXISD::TexCubeArrayU32FloatLevel; 3084 3085 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 3086 return NVPTXISD::Tld4R2DFloatFloat; 3087 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 3088 return NVPTXISD::Tld4G2DFloatFloat; 3089 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 3090 return NVPTXISD::Tld4B2DFloatFloat; 3091 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 3092 return NVPTXISD::Tld4A2DFloatFloat; 3093 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 3094 return NVPTXISD::Tld4R2DS64Float; 3095 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 3096 return NVPTXISD::Tld4G2DS64Float; 3097 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 3098 return NVPTXISD::Tld4B2DS64Float; 3099 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 3100 return NVPTXISD::Tld4A2DS64Float; 3101 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 3102 return NVPTXISD::Tld4R2DU64Float; 3103 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 3104 return NVPTXISD::Tld4G2DU64Float; 3105 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 3106 return NVPTXISD::Tld4B2DU64Float; 3107 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 3108 return NVPTXISD::Tld4A2DU64Float; 3109 3110 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 3111 return NVPTXISD::TexUnified1DFloatS32; 3112 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 3113 return NVPTXISD::TexUnified1DFloatFloat; 3114 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 3115 return NVPTXISD::TexUnified1DFloatFloatLevel; 3116 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 3117 return NVPTXISD::TexUnified1DFloatFloatGrad; 3118 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 3119 return NVPTXISD::TexUnified1DS32S32; 3120 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 3121 return NVPTXISD::TexUnified1DS32Float; 3122 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 3123 return NVPTXISD::TexUnified1DS32FloatLevel; 3124 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 3125 return NVPTXISD::TexUnified1DS32FloatGrad; 3126 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 3127 return NVPTXISD::TexUnified1DU32S32; 3128 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 3129 return NVPTXISD::TexUnified1DU32Float; 3130 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 3131 return NVPTXISD::TexUnified1DU32FloatLevel; 3132 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 3133 return NVPTXISD::TexUnified1DU32FloatGrad; 3134 3135 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 3136 return NVPTXISD::TexUnified1DArrayFloatS32; 3137 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 3138 return NVPTXISD::TexUnified1DArrayFloatFloat; 3139 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 3140 return NVPTXISD::TexUnified1DArrayFloatFloatLevel; 3141 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 
3142 return NVPTXISD::TexUnified1DArrayFloatFloatGrad; 3143 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 3144 return NVPTXISD::TexUnified1DArrayS32S32; 3145 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 3146 return NVPTXISD::TexUnified1DArrayS32Float; 3147 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 3148 return NVPTXISD::TexUnified1DArrayS32FloatLevel; 3149 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 3150 return NVPTXISD::TexUnified1DArrayS32FloatGrad; 3151 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 3152 return NVPTXISD::TexUnified1DArrayU32S32; 3153 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 3154 return NVPTXISD::TexUnified1DArrayU32Float; 3155 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 3156 return NVPTXISD::TexUnified1DArrayU32FloatLevel; 3157 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 3158 return NVPTXISD::TexUnified1DArrayU32FloatGrad; 3159 3160 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 3161 return NVPTXISD::TexUnified2DFloatS32; 3162 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 3163 return NVPTXISD::TexUnified2DFloatFloat; 3164 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 3165 return NVPTXISD::TexUnified2DFloatFloatLevel; 3166 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 3167 return NVPTXISD::TexUnified2DFloatFloatGrad; 3168 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 3169 return NVPTXISD::TexUnified2DS32S32; 3170 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 3171 return NVPTXISD::TexUnified2DS32Float; 3172 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 3173 return NVPTXISD::TexUnified2DS32FloatLevel; 3174 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 3175 return NVPTXISD::TexUnified2DS32FloatGrad; 3176 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 3177 return NVPTXISD::TexUnified2DU32S32; 3178 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 3179 return NVPTXISD::TexUnified2DU32Float; 3180 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 3181 return NVPTXISD::TexUnified2DU32FloatLevel; 3182 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 3183 return NVPTXISD::TexUnified2DU32FloatGrad; 3184 3185 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 3186 return NVPTXISD::TexUnified2DArrayFloatS32; 3187 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 3188 return NVPTXISD::TexUnified2DArrayFloatFloat; 3189 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 3190 return NVPTXISD::TexUnified2DArrayFloatFloatLevel; 3191 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 3192 return NVPTXISD::TexUnified2DArrayFloatFloatGrad; 3193 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 3194 return NVPTXISD::TexUnified2DArrayS32S32; 3195 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 3196 return NVPTXISD::TexUnified2DArrayS32Float; 3197 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 3198 return NVPTXISD::TexUnified2DArrayS32FloatLevel; 3199 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 3200 return NVPTXISD::TexUnified2DArrayS32FloatGrad; 3201 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 3202 return NVPTXISD::TexUnified2DArrayU32S32; 3203 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 3204 return NVPTXISD::TexUnified2DArrayU32Float; 3205 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 3206 return NVPTXISD::TexUnified2DArrayU32FloatLevel; 3207 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 3208 return NVPTXISD::TexUnified2DArrayU32FloatGrad; 3209 3210 case 
Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 3211 return NVPTXISD::TexUnified3DFloatS32; 3212 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 3213 return NVPTXISD::TexUnified3DFloatFloat; 3214 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 3215 return NVPTXISD::TexUnified3DFloatFloatLevel; 3216 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 3217 return NVPTXISD::TexUnified3DFloatFloatGrad; 3218 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 3219 return NVPTXISD::TexUnified3DS32S32; 3220 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 3221 return NVPTXISD::TexUnified3DS32Float; 3222 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 3223 return NVPTXISD::TexUnified3DS32FloatLevel; 3224 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 3225 return NVPTXISD::TexUnified3DS32FloatGrad; 3226 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 3227 return NVPTXISD::TexUnified3DU32S32; 3228 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 3229 return NVPTXISD::TexUnified3DU32Float; 3230 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 3231 return NVPTXISD::TexUnified3DU32FloatLevel; 3232 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 3233 return NVPTXISD::TexUnified3DU32FloatGrad; 3234 3235 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 3236 return NVPTXISD::TexUnifiedCubeFloatFloat; 3237 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 3238 return NVPTXISD::TexUnifiedCubeFloatFloatLevel; 3239 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 3240 return NVPTXISD::TexUnifiedCubeS32Float; 3241 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 3242 return NVPTXISD::TexUnifiedCubeS32FloatLevel; 3243 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 3244 return NVPTXISD::TexUnifiedCubeU32Float; 3245 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 3246 return NVPTXISD::TexUnifiedCubeU32FloatLevel; 3247 3248 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 3249 return NVPTXISD::TexUnifiedCubeArrayFloatFloat; 3250 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 3251 return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel; 3252 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 3253 return NVPTXISD::TexUnifiedCubeArrayS32Float; 3254 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 3255 return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel; 3256 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 3257 return NVPTXISD::TexUnifiedCubeArrayU32Float; 3258 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 3259 return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; 3260 3261 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 3262 return NVPTXISD::Tld4UnifiedR2DFloatFloat; 3263 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 3264 return NVPTXISD::Tld4UnifiedG2DFloatFloat; 3265 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 3266 return NVPTXISD::Tld4UnifiedB2DFloatFloat; 3267 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 3268 return NVPTXISD::Tld4UnifiedA2DFloatFloat; 3269 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 3270 return NVPTXISD::Tld4UnifiedR2DS64Float; 3271 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 3272 return NVPTXISD::Tld4UnifiedG2DS64Float; 3273 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 3274 return NVPTXISD::Tld4UnifiedB2DS64Float; 3275 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 3276 return NVPTXISD::Tld4UnifiedA2DS64Float; 3277 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 3278 return NVPTXISD::Tld4UnifiedR2DU64Float; 3279 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 3280 return 
NVPTXISD::Tld4UnifiedG2DU64Float; 3281 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 3282 return NVPTXISD::Tld4UnifiedB2DU64Float; 3283 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 3284 return NVPTXISD::Tld4UnifiedA2DU64Float; 3285 } 3286 } 3287 3288 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { 3289 switch (Intrinsic) { 3290 default: 3291 return 0; 3292 case Intrinsic::nvvm_suld_1d_i8_clamp: 3293 return NVPTXISD::Suld1DI8Clamp; 3294 case Intrinsic::nvvm_suld_1d_i16_clamp: 3295 return NVPTXISD::Suld1DI16Clamp; 3296 case Intrinsic::nvvm_suld_1d_i32_clamp: 3297 return NVPTXISD::Suld1DI32Clamp; 3298 case Intrinsic::nvvm_suld_1d_i64_clamp: 3299 return NVPTXISD::Suld1DI64Clamp; 3300 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 3301 return NVPTXISD::Suld1DV2I8Clamp; 3302 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 3303 return NVPTXISD::Suld1DV2I16Clamp; 3304 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 3305 return NVPTXISD::Suld1DV2I32Clamp; 3306 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 3307 return NVPTXISD::Suld1DV2I64Clamp; 3308 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 3309 return NVPTXISD::Suld1DV4I8Clamp; 3310 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 3311 return NVPTXISD::Suld1DV4I16Clamp; 3312 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 3313 return NVPTXISD::Suld1DV4I32Clamp; 3314 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 3315 return NVPTXISD::Suld1DArrayI8Clamp; 3316 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 3317 return NVPTXISD::Suld1DArrayI16Clamp; 3318 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 3319 return NVPTXISD::Suld1DArrayI32Clamp; 3320 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 3321 return NVPTXISD::Suld1DArrayI64Clamp; 3322 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 3323 return NVPTXISD::Suld1DArrayV2I8Clamp; 3324 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 3325 return NVPTXISD::Suld1DArrayV2I16Clamp; 3326 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 3327 return NVPTXISD::Suld1DArrayV2I32Clamp; 3328 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 3329 return NVPTXISD::Suld1DArrayV2I64Clamp; 3330 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 3331 return NVPTXISD::Suld1DArrayV4I8Clamp; 3332 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 3333 return NVPTXISD::Suld1DArrayV4I16Clamp; 3334 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 3335 return NVPTXISD::Suld1DArrayV4I32Clamp; 3336 case Intrinsic::nvvm_suld_2d_i8_clamp: 3337 return NVPTXISD::Suld2DI8Clamp; 3338 case Intrinsic::nvvm_suld_2d_i16_clamp: 3339 return NVPTXISD::Suld2DI16Clamp; 3340 case Intrinsic::nvvm_suld_2d_i32_clamp: 3341 return NVPTXISD::Suld2DI32Clamp; 3342 case Intrinsic::nvvm_suld_2d_i64_clamp: 3343 return NVPTXISD::Suld2DI64Clamp; 3344 case Intrinsic::nvvm_suld_2d_v2i8_clamp: 3345 return NVPTXISD::Suld2DV2I8Clamp; 3346 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 3347 return NVPTXISD::Suld2DV2I16Clamp; 3348 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 3349 return NVPTXISD::Suld2DV2I32Clamp; 3350 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 3351 return NVPTXISD::Suld2DV2I64Clamp; 3352 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 3353 return NVPTXISD::Suld2DV4I8Clamp; 3354 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 3355 return NVPTXISD::Suld2DV4I16Clamp; 3356 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 3357 return NVPTXISD::Suld2DV4I32Clamp; 3358 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 3359 return NVPTXISD::Suld2DArrayI8Clamp; 3360 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 3361 return NVPTXISD::Suld2DArrayI16Clamp; 3362 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 3363 return 
NVPTXISD::Suld2DArrayI32Clamp; 3364 case Intrinsic::nvvm_suld_2d_array_i64_clamp: 3365 return NVPTXISD::Suld2DArrayI64Clamp; 3366 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 3367 return NVPTXISD::Suld2DArrayV2I8Clamp; 3368 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 3369 return NVPTXISD::Suld2DArrayV2I16Clamp; 3370 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 3371 return NVPTXISD::Suld2DArrayV2I32Clamp; 3372 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 3373 return NVPTXISD::Suld2DArrayV2I64Clamp; 3374 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 3375 return NVPTXISD::Suld2DArrayV4I8Clamp; 3376 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 3377 return NVPTXISD::Suld2DArrayV4I16Clamp; 3378 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 3379 return NVPTXISD::Suld2DArrayV4I32Clamp; 3380 case Intrinsic::nvvm_suld_3d_i8_clamp: 3381 return NVPTXISD::Suld3DI8Clamp; 3382 case Intrinsic::nvvm_suld_3d_i16_clamp: 3383 return NVPTXISD::Suld3DI16Clamp; 3384 case Intrinsic::nvvm_suld_3d_i32_clamp: 3385 return NVPTXISD::Suld3DI32Clamp; 3386 case Intrinsic::nvvm_suld_3d_i64_clamp: 3387 return NVPTXISD::Suld3DI64Clamp; 3388 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 3389 return NVPTXISD::Suld3DV2I8Clamp; 3390 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 3391 return NVPTXISD::Suld3DV2I16Clamp; 3392 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 3393 return NVPTXISD::Suld3DV2I32Clamp; 3394 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 3395 return NVPTXISD::Suld3DV2I64Clamp; 3396 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 3397 return NVPTXISD::Suld3DV4I8Clamp; 3398 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 3399 return NVPTXISD::Suld3DV4I16Clamp; 3400 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 3401 return NVPTXISD::Suld3DV4I32Clamp; 3402 case Intrinsic::nvvm_suld_1d_i8_trap: 3403 return NVPTXISD::Suld1DI8Trap; 3404 case Intrinsic::nvvm_suld_1d_i16_trap: 3405 return NVPTXISD::Suld1DI16Trap; 3406 case Intrinsic::nvvm_suld_1d_i32_trap: 3407 return NVPTXISD::Suld1DI32Trap; 3408 case Intrinsic::nvvm_suld_1d_i64_trap: 3409 return NVPTXISD::Suld1DI64Trap; 3410 case Intrinsic::nvvm_suld_1d_v2i8_trap: 3411 return NVPTXISD::Suld1DV2I8Trap; 3412 case Intrinsic::nvvm_suld_1d_v2i16_trap: 3413 return NVPTXISD::Suld1DV2I16Trap; 3414 case Intrinsic::nvvm_suld_1d_v2i32_trap: 3415 return NVPTXISD::Suld1DV2I32Trap; 3416 case Intrinsic::nvvm_suld_1d_v2i64_trap: 3417 return NVPTXISD::Suld1DV2I64Trap; 3418 case Intrinsic::nvvm_suld_1d_v4i8_trap: 3419 return NVPTXISD::Suld1DV4I8Trap; 3420 case Intrinsic::nvvm_suld_1d_v4i16_trap: 3421 return NVPTXISD::Suld1DV4I16Trap; 3422 case Intrinsic::nvvm_suld_1d_v4i32_trap: 3423 return NVPTXISD::Suld1DV4I32Trap; 3424 case Intrinsic::nvvm_suld_1d_array_i8_trap: 3425 return NVPTXISD::Suld1DArrayI8Trap; 3426 case Intrinsic::nvvm_suld_1d_array_i16_trap: 3427 return NVPTXISD::Suld1DArrayI16Trap; 3428 case Intrinsic::nvvm_suld_1d_array_i32_trap: 3429 return NVPTXISD::Suld1DArrayI32Trap; 3430 case Intrinsic::nvvm_suld_1d_array_i64_trap: 3431 return NVPTXISD::Suld1DArrayI64Trap; 3432 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 3433 return NVPTXISD::Suld1DArrayV2I8Trap; 3434 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 3435 return NVPTXISD::Suld1DArrayV2I16Trap; 3436 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 3437 return NVPTXISD::Suld1DArrayV2I32Trap; 3438 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 3439 return NVPTXISD::Suld1DArrayV2I64Trap; 3440 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 3441 return NVPTXISD::Suld1DArrayV4I8Trap; 3442 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 3443 
return NVPTXISD::Suld1DArrayV4I16Trap; 3444 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 3445 return NVPTXISD::Suld1DArrayV4I32Trap; 3446 case Intrinsic::nvvm_suld_2d_i8_trap: 3447 return NVPTXISD::Suld2DI8Trap; 3448 case Intrinsic::nvvm_suld_2d_i16_trap: 3449 return NVPTXISD::Suld2DI16Trap; 3450 case Intrinsic::nvvm_suld_2d_i32_trap: 3451 return NVPTXISD::Suld2DI32Trap; 3452 case Intrinsic::nvvm_suld_2d_i64_trap: 3453 return NVPTXISD::Suld2DI64Trap; 3454 case Intrinsic::nvvm_suld_2d_v2i8_trap: 3455 return NVPTXISD::Suld2DV2I8Trap; 3456 case Intrinsic::nvvm_suld_2d_v2i16_trap: 3457 return NVPTXISD::Suld2DV2I16Trap; 3458 case Intrinsic::nvvm_suld_2d_v2i32_trap: 3459 return NVPTXISD::Suld2DV2I32Trap; 3460 case Intrinsic::nvvm_suld_2d_v2i64_trap: 3461 return NVPTXISD::Suld2DV2I64Trap; 3462 case Intrinsic::nvvm_suld_2d_v4i8_trap: 3463 return NVPTXISD::Suld2DV4I8Trap; 3464 case Intrinsic::nvvm_suld_2d_v4i16_trap: 3465 return NVPTXISD::Suld2DV4I16Trap; 3466 case Intrinsic::nvvm_suld_2d_v4i32_trap: 3467 return NVPTXISD::Suld2DV4I32Trap; 3468 case Intrinsic::nvvm_suld_2d_array_i8_trap: 3469 return NVPTXISD::Suld2DArrayI8Trap; 3470 case Intrinsic::nvvm_suld_2d_array_i16_trap: 3471 return NVPTXISD::Suld2DArrayI16Trap; 3472 case Intrinsic::nvvm_suld_2d_array_i32_trap: 3473 return NVPTXISD::Suld2DArrayI32Trap; 3474 case Intrinsic::nvvm_suld_2d_array_i64_trap: 3475 return NVPTXISD::Suld2DArrayI64Trap; 3476 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 3477 return NVPTXISD::Suld2DArrayV2I8Trap; 3478 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 3479 return NVPTXISD::Suld2DArrayV2I16Trap; 3480 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 3481 return NVPTXISD::Suld2DArrayV2I32Trap; 3482 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 3483 return NVPTXISD::Suld2DArrayV2I64Trap; 3484 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 3485 return NVPTXISD::Suld2DArrayV4I8Trap; 3486 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 3487 return NVPTXISD::Suld2DArrayV4I16Trap; 3488 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 3489 return NVPTXISD::Suld2DArrayV4I32Trap; 3490 case Intrinsic::nvvm_suld_3d_i8_trap: 3491 return NVPTXISD::Suld3DI8Trap; 3492 case Intrinsic::nvvm_suld_3d_i16_trap: 3493 return NVPTXISD::Suld3DI16Trap; 3494 case Intrinsic::nvvm_suld_3d_i32_trap: 3495 return NVPTXISD::Suld3DI32Trap; 3496 case Intrinsic::nvvm_suld_3d_i64_trap: 3497 return NVPTXISD::Suld3DI64Trap; 3498 case Intrinsic::nvvm_suld_3d_v2i8_trap: 3499 return NVPTXISD::Suld3DV2I8Trap; 3500 case Intrinsic::nvvm_suld_3d_v2i16_trap: 3501 return NVPTXISD::Suld3DV2I16Trap; 3502 case Intrinsic::nvvm_suld_3d_v2i32_trap: 3503 return NVPTXISD::Suld3DV2I32Trap; 3504 case Intrinsic::nvvm_suld_3d_v2i64_trap: 3505 return NVPTXISD::Suld3DV2I64Trap; 3506 case Intrinsic::nvvm_suld_3d_v4i8_trap: 3507 return NVPTXISD::Suld3DV4I8Trap; 3508 case Intrinsic::nvvm_suld_3d_v4i16_trap: 3509 return NVPTXISD::Suld3DV4I16Trap; 3510 case Intrinsic::nvvm_suld_3d_v4i32_trap: 3511 return NVPTXISD::Suld3DV4I32Trap; 3512 case Intrinsic::nvvm_suld_1d_i8_zero: 3513 return NVPTXISD::Suld1DI8Zero; 3514 case Intrinsic::nvvm_suld_1d_i16_zero: 3515 return NVPTXISD::Suld1DI16Zero; 3516 case Intrinsic::nvvm_suld_1d_i32_zero: 3517 return NVPTXISD::Suld1DI32Zero; 3518 case Intrinsic::nvvm_suld_1d_i64_zero: 3519 return NVPTXISD::Suld1DI64Zero; 3520 case Intrinsic::nvvm_suld_1d_v2i8_zero: 3521 return NVPTXISD::Suld1DV2I8Zero; 3522 case Intrinsic::nvvm_suld_1d_v2i16_zero: 3523 return NVPTXISD::Suld1DV2I16Zero; 3524 case Intrinsic::nvvm_suld_1d_v2i32_zero: 3525 return 
NVPTXISD::Suld1DV2I32Zero; 3526 case Intrinsic::nvvm_suld_1d_v2i64_zero: 3527 return NVPTXISD::Suld1DV2I64Zero; 3528 case Intrinsic::nvvm_suld_1d_v4i8_zero: 3529 return NVPTXISD::Suld1DV4I8Zero; 3530 case Intrinsic::nvvm_suld_1d_v4i16_zero: 3531 return NVPTXISD::Suld1DV4I16Zero; 3532 case Intrinsic::nvvm_suld_1d_v4i32_zero: 3533 return NVPTXISD::Suld1DV4I32Zero; 3534 case Intrinsic::nvvm_suld_1d_array_i8_zero: 3535 return NVPTXISD::Suld1DArrayI8Zero; 3536 case Intrinsic::nvvm_suld_1d_array_i16_zero: 3537 return NVPTXISD::Suld1DArrayI16Zero; 3538 case Intrinsic::nvvm_suld_1d_array_i32_zero: 3539 return NVPTXISD::Suld1DArrayI32Zero; 3540 case Intrinsic::nvvm_suld_1d_array_i64_zero: 3541 return NVPTXISD::Suld1DArrayI64Zero; 3542 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 3543 return NVPTXISD::Suld1DArrayV2I8Zero; 3544 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 3545 return NVPTXISD::Suld1DArrayV2I16Zero; 3546 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 3547 return NVPTXISD::Suld1DArrayV2I32Zero; 3548 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 3549 return NVPTXISD::Suld1DArrayV2I64Zero; 3550 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 3551 return NVPTXISD::Suld1DArrayV4I8Zero; 3552 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 3553 return NVPTXISD::Suld1DArrayV4I16Zero; 3554 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 3555 return NVPTXISD::Suld1DArrayV4I32Zero; 3556 case Intrinsic::nvvm_suld_2d_i8_zero: 3557 return NVPTXISD::Suld2DI8Zero; 3558 case Intrinsic::nvvm_suld_2d_i16_zero: 3559 return NVPTXISD::Suld2DI16Zero; 3560 case Intrinsic::nvvm_suld_2d_i32_zero: 3561 return NVPTXISD::Suld2DI32Zero; 3562 case Intrinsic::nvvm_suld_2d_i64_zero: 3563 return NVPTXISD::Suld2DI64Zero; 3564 case Intrinsic::nvvm_suld_2d_v2i8_zero: 3565 return NVPTXISD::Suld2DV2I8Zero; 3566 case Intrinsic::nvvm_suld_2d_v2i16_zero: 3567 return NVPTXISD::Suld2DV2I16Zero; 3568 case Intrinsic::nvvm_suld_2d_v2i32_zero: 3569 return NVPTXISD::Suld2DV2I32Zero; 3570 case Intrinsic::nvvm_suld_2d_v2i64_zero: 3571 return NVPTXISD::Suld2DV2I64Zero; 3572 case Intrinsic::nvvm_suld_2d_v4i8_zero: 3573 return NVPTXISD::Suld2DV4I8Zero; 3574 case Intrinsic::nvvm_suld_2d_v4i16_zero: 3575 return NVPTXISD::Suld2DV4I16Zero; 3576 case Intrinsic::nvvm_suld_2d_v4i32_zero: 3577 return NVPTXISD::Suld2DV4I32Zero; 3578 case Intrinsic::nvvm_suld_2d_array_i8_zero: 3579 return NVPTXISD::Suld2DArrayI8Zero; 3580 case Intrinsic::nvvm_suld_2d_array_i16_zero: 3581 return NVPTXISD::Suld2DArrayI16Zero; 3582 case Intrinsic::nvvm_suld_2d_array_i32_zero: 3583 return NVPTXISD::Suld2DArrayI32Zero; 3584 case Intrinsic::nvvm_suld_2d_array_i64_zero: 3585 return NVPTXISD::Suld2DArrayI64Zero; 3586 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 3587 return NVPTXISD::Suld2DArrayV2I8Zero; 3588 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 3589 return NVPTXISD::Suld2DArrayV2I16Zero; 3590 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 3591 return NVPTXISD::Suld2DArrayV2I32Zero; 3592 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 3593 return NVPTXISD::Suld2DArrayV2I64Zero; 3594 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 3595 return NVPTXISD::Suld2DArrayV4I8Zero; 3596 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 3597 return NVPTXISD::Suld2DArrayV4I16Zero; 3598 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 3599 return NVPTXISD::Suld2DArrayV4I32Zero; 3600 case Intrinsic::nvvm_suld_3d_i8_zero: 3601 return NVPTXISD::Suld3DI8Zero; 3602 case Intrinsic::nvvm_suld_3d_i16_zero: 3603 return NVPTXISD::Suld3DI16Zero; 3604 case Intrinsic::nvvm_suld_3d_i32_zero: 3605 
return NVPTXISD::Suld3DI32Zero; 3606 case Intrinsic::nvvm_suld_3d_i64_zero: 3607 return NVPTXISD::Suld3DI64Zero; 3608 case Intrinsic::nvvm_suld_3d_v2i8_zero: 3609 return NVPTXISD::Suld3DV2I8Zero; 3610 case Intrinsic::nvvm_suld_3d_v2i16_zero: 3611 return NVPTXISD::Suld3DV2I16Zero; 3612 case Intrinsic::nvvm_suld_3d_v2i32_zero: 3613 return NVPTXISD::Suld3DV2I32Zero; 3614 case Intrinsic::nvvm_suld_3d_v2i64_zero: 3615 return NVPTXISD::Suld3DV2I64Zero; 3616 case Intrinsic::nvvm_suld_3d_v4i8_zero: 3617 return NVPTXISD::Suld3DV4I8Zero; 3618 case Intrinsic::nvvm_suld_3d_v4i16_zero: 3619 return NVPTXISD::Suld3DV4I16Zero; 3620 case Intrinsic::nvvm_suld_3d_v4i32_zero: 3621 return NVPTXISD::Suld3DV4I32Zero; 3622 } 3623 } 3624 3625 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as 3626 // TgtMemIntrinsic 3627 // because we need the information that is only available in the "Value" type 3628 // of destination 3629 // pointer. In particular, the address space information. 3630 bool NVPTXTargetLowering::getTgtMemIntrinsic( 3631 IntrinsicInfo &Info, const CallInst &I, 3632 MachineFunction &MF, unsigned Intrinsic) const { 3633 switch (Intrinsic) { 3634 default: 3635 return false; 3636 case Intrinsic::nvvm_match_all_sync_i32p: 3637 case Intrinsic::nvvm_match_all_sync_i64p: 3638 Info.opc = ISD::INTRINSIC_W_CHAIN; 3639 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute 3640 // in order to model data exchange with other threads, but perform no real 3641 // memory accesses. 3642 Info.memVT = MVT::i1; 3643 3644 // Our result depends on both our and other thread's arguments. 3645 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 3646 return true; 3647 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col: 3648 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row: 3649 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride: 3650 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride: 3651 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col: 3652 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row: 3653 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride: 3654 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride: 3655 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col: 3656 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row: 3657 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride: 3658 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride: 3659 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col: 3660 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row: 3661 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride: 3662 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride: 3663 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col: 3664 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row: 3665 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride: 3666 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride: 3667 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col: 3668 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row: 3669 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride: 3670 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: { 3671 Info.opc = ISD::INTRINSIC_W_CHAIN; 3672 Info.memVT = MVT::v8f16; 3673 Info.ptrVal = I.getArgOperand(0); 3674 Info.offset = 0; 3675 Info.flags = MachineMemOperand::MOLoad; 3676 Info.align = Align(16); 3677 return true; 3678 } 3679 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col: 3680 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride: 3681 case 
Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride: 3682 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col: 3683 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row: 3684 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride: 3685 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride: 3686 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row: 3687 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col: 3688 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride: 3689 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row: 3690 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride: 3691 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col: 3692 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride: 3693 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride: 3694 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col: 3695 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row: 3696 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride: 3697 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride: 3698 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: 3699 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col: 3700 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride: 3701 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row: 3702 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: { 3703 Info.opc = ISD::INTRINSIC_W_CHAIN; 3704 Info.memVT = MVT::v2i32; 3705 Info.ptrVal = I.getArgOperand(0); 3706 Info.offset = 0; 3707 Info.flags = MachineMemOperand::MOLoad; 3708 Info.align = Align(8); 3709 return true; 3710 } 3711 3712 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col: 3713 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride: 3714 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride: 3715 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col: 3716 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row: 3717 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride: 3718 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride: 3719 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row: 3720 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col: 3721 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride: 3722 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row: 3723 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride: 3724 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col: 3725 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride: 3726 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row: 3727 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride: 3728 3729 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col: 3730 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride: 3731 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride: 3732 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col: 3733 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row: 3734 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride: 3735 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride: 3736 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: 3737 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col: 3738 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride: 3739 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row: 3740 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride: 3741 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col: 3742 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride: 3743 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row: 3744 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride: 3745 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16: 3746 case 
Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: { 3747 Info.opc = ISD::INTRINSIC_W_CHAIN; 3748 Info.memVT = MVT::v4i32; 3749 Info.ptrVal = I.getArgOperand(0); 3750 Info.offset = 0; 3751 Info.flags = MachineMemOperand::MOLoad; 3752 Info.align = Align(16); 3753 return true; 3754 } 3755 3756 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col: 3757 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride: 3758 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride: 3759 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col: 3760 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row: 3761 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride: 3762 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride: 3763 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row: 3764 3765 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col: 3766 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride: 3767 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride: 3768 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col: 3769 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row: 3770 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride: 3771 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride: 3772 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row: 3773 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row: 3774 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride: 3775 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col: 3776 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride: 3777 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row: 3778 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride: 3779 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride: 3780 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row: 3781 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col: 3782 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride: 3783 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride: 3784 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: 3785 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16: 3786 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: { 3787 Info.opc = ISD::INTRINSIC_W_CHAIN; 3788 Info.memVT = MVT::i32; 3789 Info.ptrVal = I.getArgOperand(0); 3790 Info.offset = 0; 3791 Info.flags = MachineMemOperand::MOLoad; 3792 Info.align = Align(4); 3793 return true; 3794 } 3795 3796 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col: 3797 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row: 3798 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride: 3799 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride: 3800 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col: 3801 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row: 3802 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride: 3803 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride: 3804 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col: 3805 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row: 3806 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride: 3807 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: { 3808 Info.opc = ISD::INTRINSIC_W_CHAIN; 3809 Info.memVT = MVT::v4f16; 3810 Info.ptrVal = I.getArgOperand(0); 3811 Info.offset = 0; 3812 Info.flags = MachineMemOperand::MOLoad; 3813 Info.align = Align(16); 3814 return true; 3815 } 3816 3817 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col: 3818 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row: 3819 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride: 3820 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride: 3821 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col: 3822 case 
Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row: 3823 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride: 3824 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride: 3825 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col: 3826 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row: 3827 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride: 3828 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: 3829 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col: 3830 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row: 3831 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride: 3832 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: { 3833 Info.opc = ISD::INTRINSIC_W_CHAIN; 3834 Info.memVT = MVT::v8f32; 3835 Info.ptrVal = I.getArgOperand(0); 3836 Info.offset = 0; 3837 Info.flags = MachineMemOperand::MOLoad; 3838 Info.align = Align(16); 3839 return true; 3840 } 3841 3842 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col: 3843 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride: 3844 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row: 3845 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride: 3846 3847 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col: 3848 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride: 3849 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row: 3850 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride: 3851 3852 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col: 3853 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride: 3854 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row: 3855 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride: 3856 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col: 3857 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride: 3858 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row: 3859 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride: 3860 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col: 3861 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride: 3862 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row: 3863 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: { 3864 Info.opc = ISD::INTRINSIC_W_CHAIN; 3865 Info.memVT = MVT::v8i32; 3866 Info.ptrVal = I.getArgOperand(0); 3867 Info.offset = 0; 3868 Info.flags = MachineMemOperand::MOLoad; 3869 Info.align = Align(16); 3870 return true; 3871 } 3872 3873 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col: 3874 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride: 3875 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row: 3876 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride: 3877 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col: 3878 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride: 3879 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row: 3880 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: 3881 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16: 3882 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: { 3883 Info.opc = ISD::INTRINSIC_W_CHAIN; 3884 Info.memVT = MVT::v2i32; 3885 Info.ptrVal = I.getArgOperand(0); 3886 Info.offset = 0; 3887 Info.flags = MachineMemOperand::MOLoad; 3888 Info.align = Align(8); 3889 return true; 3890 } 3891 3892 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col: 3893 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride: 3894 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row: 3895 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride: 3896 3897 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col: 3898 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride: 3899 case 
Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row: 3900 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: { 3901 Info.opc = ISD::INTRINSIC_W_CHAIN; 3902 Info.memVT = MVT::f64; 3903 Info.ptrVal = I.getArgOperand(0); 3904 Info.offset = 0; 3905 Info.flags = MachineMemOperand::MOLoad; 3906 Info.align = Align(8); 3907 return true; 3908 } 3909 3910 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col: 3911 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride: 3912 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row: 3913 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: { 3914 Info.opc = ISD::INTRINSIC_W_CHAIN; 3915 Info.memVT = MVT::v2f64; 3916 Info.ptrVal = I.getArgOperand(0); 3917 Info.offset = 0; 3918 Info.flags = MachineMemOperand::MOLoad; 3919 Info.align = Align(16); 3920 return true; 3921 } 3922 3923 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col: 3924 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row: 3925 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride: 3926 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride: 3927 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col: 3928 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row: 3929 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride: 3930 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride: 3931 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col: 3932 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row: 3933 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride: 3934 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: { 3935 Info.opc = ISD::INTRINSIC_VOID; 3936 Info.memVT = MVT::v4f16; 3937 Info.ptrVal = I.getArgOperand(0); 3938 Info.offset = 0; 3939 Info.flags = MachineMemOperand::MOStore; 3940 Info.align = Align(16); 3941 return true; 3942 } 3943 3944 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col: 3945 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row: 3946 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride: 3947 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride: 3948 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col: 3949 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row: 3950 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride: 3951 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride: 3952 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col: 3953 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row: 3954 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride: 3955 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: 3956 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col: 3957 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row: 3958 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride: 3959 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: { 3960 Info.opc = ISD::INTRINSIC_VOID; 3961 Info.memVT = MVT::v8f32; 3962 Info.ptrVal = I.getArgOperand(0); 3963 Info.offset = 0; 3964 Info.flags = MachineMemOperand::MOStore; 3965 Info.align = Align(16); 3966 return true; 3967 } 3968 3969 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col: 3970 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride: 3971 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row: 3972 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride: 3973 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col: 3974 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride: 3975 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row: 3976 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride: 3977 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col: 3978 case 
Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride: 3979 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row: 3980 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: { 3981 Info.opc = ISD::INTRINSIC_VOID; 3982 Info.memVT = MVT::v8i32; 3983 Info.ptrVal = I.getArgOperand(0); 3984 Info.offset = 0; 3985 Info.flags = MachineMemOperand::MOStore; 3986 Info.align = Align(16); 3987 return true; 3988 } 3989 3990 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col: 3991 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride: 3992 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row: 3993 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride: 3994 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col: 3995 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride: 3996 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row: 3997 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: { 3998 Info.opc = ISD::INTRINSIC_VOID; 3999 Info.memVT = MVT::v2i32; 4000 Info.ptrVal = I.getArgOperand(0); 4001 Info.offset = 0; 4002 Info.flags = MachineMemOperand::MOStore; 4003 Info.align = Align(8); 4004 return true; 4005 } 4006 4007 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col: 4008 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride: 4009 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row: 4010 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: { 4011 Info.opc = ISD::INTRINSIC_VOID; 4012 Info.memVT = MVT::v2f64; 4013 Info.ptrVal = I.getArgOperand(0); 4014 Info.offset = 0; 4015 Info.flags = MachineMemOperand::MOStore; 4016 Info.align = Align(16); 4017 return true; 4018 } 4019 4020 case Intrinsic::nvvm_atomic_load_inc_32: 4021 case Intrinsic::nvvm_atomic_load_dec_32: 4022 4023 case Intrinsic::nvvm_atomic_add_gen_f_cta: 4024 case Intrinsic::nvvm_atomic_add_gen_f_sys: 4025 case Intrinsic::nvvm_atomic_add_gen_i_cta: 4026 case Intrinsic::nvvm_atomic_add_gen_i_sys: 4027 case Intrinsic::nvvm_atomic_and_gen_i_cta: 4028 case Intrinsic::nvvm_atomic_and_gen_i_sys: 4029 case Intrinsic::nvvm_atomic_cas_gen_i_cta: 4030 case Intrinsic::nvvm_atomic_cas_gen_i_sys: 4031 case Intrinsic::nvvm_atomic_dec_gen_i_cta: 4032 case Intrinsic::nvvm_atomic_dec_gen_i_sys: 4033 case Intrinsic::nvvm_atomic_inc_gen_i_cta: 4034 case Intrinsic::nvvm_atomic_inc_gen_i_sys: 4035 case Intrinsic::nvvm_atomic_max_gen_i_cta: 4036 case Intrinsic::nvvm_atomic_max_gen_i_sys: 4037 case Intrinsic::nvvm_atomic_min_gen_i_cta: 4038 case Intrinsic::nvvm_atomic_min_gen_i_sys: 4039 case Intrinsic::nvvm_atomic_or_gen_i_cta: 4040 case Intrinsic::nvvm_atomic_or_gen_i_sys: 4041 case Intrinsic::nvvm_atomic_exch_gen_i_cta: 4042 case Intrinsic::nvvm_atomic_exch_gen_i_sys: 4043 case Intrinsic::nvvm_atomic_xor_gen_i_cta: 4044 case Intrinsic::nvvm_atomic_xor_gen_i_sys: { 4045 auto &DL = I.getModule()->getDataLayout(); 4046 Info.opc = ISD::INTRINSIC_W_CHAIN; 4047 Info.memVT = getValueType(DL, I.getType()); 4048 Info.ptrVal = I.getArgOperand(0); 4049 Info.offset = 0; 4050 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 4051 Info.align.reset(); 4052 return true; 4053 } 4054 4055 case Intrinsic::nvvm_ldu_global_i: 4056 case Intrinsic::nvvm_ldu_global_f: 4057 case Intrinsic::nvvm_ldu_global_p: { 4058 auto &DL = I.getModule()->getDataLayout(); 4059 Info.opc = ISD::INTRINSIC_W_CHAIN; 4060 if (Intrinsic == Intrinsic::nvvm_ldu_global_i) 4061 Info.memVT = getValueType(DL, I.getType()); 4062 else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) 4063 Info.memVT = getPointerTy(DL); 4064 else 4065 Info.memVT = getValueType(DL, I.getType()); 4066 Info.ptrVal 
= I.getArgOperand(0); 4067 Info.offset = 0; 4068 Info.flags = MachineMemOperand::MOLoad; 4069 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 4070 4071 return true; 4072 } 4073 case Intrinsic::nvvm_ldg_global_i: 4074 case Intrinsic::nvvm_ldg_global_f: 4075 case Intrinsic::nvvm_ldg_global_p: { 4076 auto &DL = I.getModule()->getDataLayout(); 4077 4078 Info.opc = ISD::INTRINSIC_W_CHAIN; 4079 if (Intrinsic == Intrinsic::nvvm_ldg_global_i) 4080 Info.memVT = getValueType(DL, I.getType()); 4081 else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) 4082 Info.memVT = getPointerTy(DL); 4083 else 4084 Info.memVT = getValueType(DL, I.getType()); 4085 Info.ptrVal = I.getArgOperand(0); 4086 Info.offset = 0; 4087 Info.flags = MachineMemOperand::MOLoad; 4088 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 4089 4090 return true; 4091 } 4092 4093 case Intrinsic::nvvm_tex_1d_v4f32_s32: 4094 case Intrinsic::nvvm_tex_1d_v4f32_f32: 4095 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 4096 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 4097 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 4098 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 4099 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 4100 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 4101 case Intrinsic::nvvm_tex_2d_v4f32_s32: 4102 case Intrinsic::nvvm_tex_2d_v4f32_f32: 4103 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 4104 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 4105 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 4106 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 4107 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 4108 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 4109 case Intrinsic::nvvm_tex_3d_v4f32_s32: 4110 case Intrinsic::nvvm_tex_3d_v4f32_f32: 4111 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 4112 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 4113 case Intrinsic::nvvm_tex_cube_v4f32_f32: 4114 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 4115 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 4116 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 4117 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 4118 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 4119 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 4120 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 4121 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 4122 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 4123 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 4124 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 4125 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 4126 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 4127 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 4128 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 4129 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 4130 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 4131 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 4132 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 4133 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 4134 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 4135 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 4136 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 4137 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 4138 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 4139 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 4140 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 4141 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 4142 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 4143 case 
Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 4144 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 4145 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 4146 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 4147 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 4148 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 4149 Info.opc = getOpcForTextureInstr(Intrinsic); 4150 Info.memVT = MVT::v4f32; 4151 Info.ptrVal = nullptr; 4152 Info.offset = 0; 4153 Info.flags = MachineMemOperand::MOLoad; 4154 Info.align = Align(16); 4155 return true; 4156 4157 case Intrinsic::nvvm_tex_1d_v4s32_s32: 4158 case Intrinsic::nvvm_tex_1d_v4s32_f32: 4159 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 4160 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 4161 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 4162 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 4163 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 4164 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 4165 case Intrinsic::nvvm_tex_2d_v4s32_s32: 4166 case Intrinsic::nvvm_tex_2d_v4s32_f32: 4167 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 4168 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 4169 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 4170 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 4171 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 4172 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 4173 case Intrinsic::nvvm_tex_3d_v4s32_s32: 4174 case Intrinsic::nvvm_tex_3d_v4s32_f32: 4175 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 4176 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 4177 case Intrinsic::nvvm_tex_cube_v4s32_f32: 4178 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 4179 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 4180 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 4181 case Intrinsic::nvvm_tex_cube_v4u32_f32: 4182 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 4183 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 4184 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 4185 case Intrinsic::nvvm_tex_1d_v4u32_s32: 4186 case Intrinsic::nvvm_tex_1d_v4u32_f32: 4187 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 4188 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 4189 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 4190 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 4191 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 4192 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 4193 case Intrinsic::nvvm_tex_2d_v4u32_s32: 4194 case Intrinsic::nvvm_tex_2d_v4u32_f32: 4195 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 4196 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 4197 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 4198 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 4199 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 4200 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 4201 case Intrinsic::nvvm_tex_3d_v4u32_s32: 4202 case Intrinsic::nvvm_tex_3d_v4u32_f32: 4203 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 4204 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 4205 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 4206 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 4207 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 4208 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 4209 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 4210 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 4211 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 4212 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 4213 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 4214 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 4215 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 4216 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 4217 case 
Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 4218 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 4219 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 4220 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 4221 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 4222 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 4223 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 4224 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 4225 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 4226 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 4227 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 4228 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 4229 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 4230 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 4231 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 4232 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 4233 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 4234 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 4235 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 4236 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 4237 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 4238 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 4239 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 4240 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 4241 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 4242 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 4243 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 4244 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 4245 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 4246 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 4247 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 4248 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 4249 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 4250 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 4251 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 4252 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 4253 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 4254 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 4255 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 4256 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 4257 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 4258 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 4259 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 4260 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 4261 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 4262 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 4263 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 4264 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 4265 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 4266 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 4267 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 4268 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 4269 Info.opc = getOpcForTextureInstr(Intrinsic); 4270 Info.memVT = MVT::v4i32; 4271 Info.ptrVal = nullptr; 4272 Info.offset = 0; 4273 Info.flags = MachineMemOperand::MOLoad; 4274 Info.align = Align(16); 4275 return true; 4276 4277 case Intrinsic::nvvm_suld_1d_i8_clamp: 4278 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 4279 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 4280 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 4281 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 4282 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 4283 case Intrinsic::nvvm_suld_2d_i8_clamp: 4284 case 
Intrinsic::nvvm_suld_2d_v2i8_clamp: 4285 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 4286 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 4287 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 4288 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 4289 case Intrinsic::nvvm_suld_3d_i8_clamp: 4290 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 4291 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 4292 case Intrinsic::nvvm_suld_1d_i8_trap: 4293 case Intrinsic::nvvm_suld_1d_v2i8_trap: 4294 case Intrinsic::nvvm_suld_1d_v4i8_trap: 4295 case Intrinsic::nvvm_suld_1d_array_i8_trap: 4296 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 4297 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 4298 case Intrinsic::nvvm_suld_2d_i8_trap: 4299 case Intrinsic::nvvm_suld_2d_v2i8_trap: 4300 case Intrinsic::nvvm_suld_2d_v4i8_trap: 4301 case Intrinsic::nvvm_suld_2d_array_i8_trap: 4302 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 4303 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 4304 case Intrinsic::nvvm_suld_3d_i8_trap: 4305 case Intrinsic::nvvm_suld_3d_v2i8_trap: 4306 case Intrinsic::nvvm_suld_3d_v4i8_trap: 4307 case Intrinsic::nvvm_suld_1d_i8_zero: 4308 case Intrinsic::nvvm_suld_1d_v2i8_zero: 4309 case Intrinsic::nvvm_suld_1d_v4i8_zero: 4310 case Intrinsic::nvvm_suld_1d_array_i8_zero: 4311 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 4312 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 4313 case Intrinsic::nvvm_suld_2d_i8_zero: 4314 case Intrinsic::nvvm_suld_2d_v2i8_zero: 4315 case Intrinsic::nvvm_suld_2d_v4i8_zero: 4316 case Intrinsic::nvvm_suld_2d_array_i8_zero: 4317 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 4318 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 4319 case Intrinsic::nvvm_suld_3d_i8_zero: 4320 case Intrinsic::nvvm_suld_3d_v2i8_zero: 4321 case Intrinsic::nvvm_suld_3d_v4i8_zero: 4322 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4323 Info.memVT = MVT::i8; 4324 Info.ptrVal = nullptr; 4325 Info.offset = 0; 4326 Info.flags = MachineMemOperand::MOLoad; 4327 Info.align = Align(16); 4328 return true; 4329 4330 case Intrinsic::nvvm_suld_1d_i16_clamp: 4331 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 4332 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 4333 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 4334 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 4335 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 4336 case Intrinsic::nvvm_suld_2d_i16_clamp: 4337 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 4338 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 4339 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 4340 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 4341 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 4342 case Intrinsic::nvvm_suld_3d_i16_clamp: 4343 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 4344 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 4345 case Intrinsic::nvvm_suld_1d_i16_trap: 4346 case Intrinsic::nvvm_suld_1d_v2i16_trap: 4347 case Intrinsic::nvvm_suld_1d_v4i16_trap: 4348 case Intrinsic::nvvm_suld_1d_array_i16_trap: 4349 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 4350 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 4351 case Intrinsic::nvvm_suld_2d_i16_trap: 4352 case Intrinsic::nvvm_suld_2d_v2i16_trap: 4353 case Intrinsic::nvvm_suld_2d_v4i16_trap: 4354 case Intrinsic::nvvm_suld_2d_array_i16_trap: 4355 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 4356 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 4357 case Intrinsic::nvvm_suld_3d_i16_trap: 4358 case Intrinsic::nvvm_suld_3d_v2i16_trap: 4359 case Intrinsic::nvvm_suld_3d_v4i16_trap: 4360 case Intrinsic::nvvm_suld_1d_i16_zero: 4361 case Intrinsic::nvvm_suld_1d_v2i16_zero: 4362 case 
Intrinsic::nvvm_suld_1d_v4i16_zero: 4363 case Intrinsic::nvvm_suld_1d_array_i16_zero: 4364 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 4365 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 4366 case Intrinsic::nvvm_suld_2d_i16_zero: 4367 case Intrinsic::nvvm_suld_2d_v2i16_zero: 4368 case Intrinsic::nvvm_suld_2d_v4i16_zero: 4369 case Intrinsic::nvvm_suld_2d_array_i16_zero: 4370 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 4371 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 4372 case Intrinsic::nvvm_suld_3d_i16_zero: 4373 case Intrinsic::nvvm_suld_3d_v2i16_zero: 4374 case Intrinsic::nvvm_suld_3d_v4i16_zero: 4375 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4376 Info.memVT = MVT::i16; 4377 Info.ptrVal = nullptr; 4378 Info.offset = 0; 4379 Info.flags = MachineMemOperand::MOLoad; 4380 Info.align = Align(16); 4381 return true; 4382 4383 case Intrinsic::nvvm_suld_1d_i32_clamp: 4384 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 4385 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 4386 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 4387 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 4388 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 4389 case Intrinsic::nvvm_suld_2d_i32_clamp: 4390 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 4391 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 4392 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 4393 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 4394 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 4395 case Intrinsic::nvvm_suld_3d_i32_clamp: 4396 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 4397 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 4398 case Intrinsic::nvvm_suld_1d_i32_trap: 4399 case Intrinsic::nvvm_suld_1d_v2i32_trap: 4400 case Intrinsic::nvvm_suld_1d_v4i32_trap: 4401 case Intrinsic::nvvm_suld_1d_array_i32_trap: 4402 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 4403 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 4404 case Intrinsic::nvvm_suld_2d_i32_trap: 4405 case Intrinsic::nvvm_suld_2d_v2i32_trap: 4406 case Intrinsic::nvvm_suld_2d_v4i32_trap: 4407 case Intrinsic::nvvm_suld_2d_array_i32_trap: 4408 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 4409 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 4410 case Intrinsic::nvvm_suld_3d_i32_trap: 4411 case Intrinsic::nvvm_suld_3d_v2i32_trap: 4412 case Intrinsic::nvvm_suld_3d_v4i32_trap: 4413 case Intrinsic::nvvm_suld_1d_i32_zero: 4414 case Intrinsic::nvvm_suld_1d_v2i32_zero: 4415 case Intrinsic::nvvm_suld_1d_v4i32_zero: 4416 case Intrinsic::nvvm_suld_1d_array_i32_zero: 4417 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 4418 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 4419 case Intrinsic::nvvm_suld_2d_i32_zero: 4420 case Intrinsic::nvvm_suld_2d_v2i32_zero: 4421 case Intrinsic::nvvm_suld_2d_v4i32_zero: 4422 case Intrinsic::nvvm_suld_2d_array_i32_zero: 4423 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 4424 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 4425 case Intrinsic::nvvm_suld_3d_i32_zero: 4426 case Intrinsic::nvvm_suld_3d_v2i32_zero: 4427 case Intrinsic::nvvm_suld_3d_v4i32_zero: 4428 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4429 Info.memVT = MVT::i32; 4430 Info.ptrVal = nullptr; 4431 Info.offset = 0; 4432 Info.flags = MachineMemOperand::MOLoad; 4433 Info.align = Align(16); 4434 return true; 4435 4436 case Intrinsic::nvvm_suld_1d_i64_clamp: 4437 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 4438 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 4439 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 4440 case Intrinsic::nvvm_suld_2d_i64_clamp: 4441 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 4442 case 
Intrinsic::nvvm_suld_2d_array_i64_clamp: 4443 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 4444 case Intrinsic::nvvm_suld_3d_i64_clamp: 4445 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 4446 case Intrinsic::nvvm_suld_1d_i64_trap: 4447 case Intrinsic::nvvm_suld_1d_v2i64_trap: 4448 case Intrinsic::nvvm_suld_1d_array_i64_trap: 4449 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 4450 case Intrinsic::nvvm_suld_2d_i64_trap: 4451 case Intrinsic::nvvm_suld_2d_v2i64_trap: 4452 case Intrinsic::nvvm_suld_2d_array_i64_trap: 4453 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 4454 case Intrinsic::nvvm_suld_3d_i64_trap: 4455 case Intrinsic::nvvm_suld_3d_v2i64_trap: 4456 case Intrinsic::nvvm_suld_1d_i64_zero: 4457 case Intrinsic::nvvm_suld_1d_v2i64_zero: 4458 case Intrinsic::nvvm_suld_1d_array_i64_zero: 4459 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 4460 case Intrinsic::nvvm_suld_2d_i64_zero: 4461 case Intrinsic::nvvm_suld_2d_v2i64_zero: 4462 case Intrinsic::nvvm_suld_2d_array_i64_zero: 4463 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 4464 case Intrinsic::nvvm_suld_3d_i64_zero: 4465 case Intrinsic::nvvm_suld_3d_v2i64_zero: 4466 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4467 Info.memVT = MVT::i64; 4468 Info.ptrVal = nullptr; 4469 Info.offset = 0; 4470 Info.flags = MachineMemOperand::MOLoad; 4471 Info.align = Align(16); 4472 return true; 4473 } 4474 return false; 4475 } 4476 4477 /// getFunctionParamOptimizedAlign - since function arguments are passed via 4478 /// .param space, we may want to increase their alignment in a way that 4479 /// ensures that we can effectively vectorize their loads & stores. We can 4480 /// increase alignment only if the function has internal or has private 4481 /// linkage as for other linkage types callers may already rely on default 4482 /// alignment. To allow using 128-bit vectorized loads/stores, this function 4483 /// ensures that alignment is 16 or greater. 4484 Align NVPTXTargetLowering::getFunctionParamOptimizedAlign( 4485 const Function *F, Type *ArgTy, const DataLayout &DL) const { 4486 const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value(); 4487 4488 // If a function has linkage different from internal or private, we 4489 // must use default ABI alignment as external users rely on it. Same 4490 // for a function that may be called from a function pointer. 4491 if (!F || !F->hasLocalLinkage() || 4492 F->hasAddressTaken(/*Users=*/nullptr, 4493 /*IgnoreCallbackUses=*/false, 4494 /*IgnoreAssumeLikeCalls=*/true, 4495 /*IgnoreLLVMUsed=*/true)) 4496 return Align(ABITypeAlign); 4497 4498 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage"); 4499 return Align(std::max(uint64_t(16), ABITypeAlign)); 4500 } 4501 4502 /// Helper for computing alignment of a device function byval parameter. 4503 Align NVPTXTargetLowering::getFunctionByValParamAlign( 4504 const Function *F, Type *ArgTy, Align InitialAlign, 4505 const DataLayout &DL) const { 4506 Align ArgAlign = InitialAlign; 4507 // Try to increase alignment to enhance vectorization options. 4508 if (F) 4509 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL)); 4510 4511 // Work around a bug in ptxas. When PTX code takes address of 4512 // byval parameter with alignment < 4, ptxas generates code to 4513 // spill argument into memory. Alas on sm_50+ ptxas generates 4514 // SASS code that fails with misaligned access. To work around 4515 // the problem, make sure that we align byval parameters by at 4516 // least 4. 
4517 // TODO: this will need to be undone when we get to support multi-TU 4518 // device-side compilation as it breaks ABI compatibility with nvcc. 4519 // Hopefully ptxas bug is fixed by then. 4520 ArgAlign = std::max(ArgAlign, Align(4)); 4521 4522 return ArgAlign; 4523 } 4524 4525 /// isLegalAddressingMode - Return true if the addressing mode represented 4526 /// by AM is legal for this target, for a load/store of the specified type. 4527 /// Used to guide target specific optimizations, like loop strength reduction 4528 /// (LoopStrengthReduce.cpp) and memory optimization for address mode 4529 /// (CodeGenPrepare.cpp) 4530 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, 4531 const AddrMode &AM, Type *Ty, 4532 unsigned AS, Instruction *I) const { 4533 // AddrMode - This represents an addressing mode of: 4534 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg 4535 // 4536 // The legal address modes are 4537 // - [avar] 4538 // - [areg] 4539 // - [areg+immoff] 4540 // - [immAddr] 4541 4542 if (AM.BaseGV) { 4543 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale; 4544 } 4545 4546 switch (AM.Scale) { 4547 case 0: // "r", "r+i" or "i" is allowed 4548 break; 4549 case 1: 4550 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed. 4551 return false; 4552 // Otherwise we have r+i. 4553 break; 4554 default: 4555 // No scale > 1 is allowed 4556 return false; 4557 } 4558 return true; 4559 } 4560 4561 //===----------------------------------------------------------------------===// 4562 // NVPTX Inline Assembly Support 4563 //===----------------------------------------------------------------------===// 4564 4565 /// getConstraintType - Given a constraint letter, return the type of 4566 /// constraint it is for this target. 4567 NVPTXTargetLowering::ConstraintType 4568 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { 4569 if (Constraint.size() == 1) { 4570 switch (Constraint[0]) { 4571 default: 4572 break; 4573 case 'b': 4574 case 'r': 4575 case 'h': 4576 case 'c': 4577 case 'l': 4578 case 'f': 4579 case 'd': 4580 case '0': 4581 case 'N': 4582 return C_RegisterClass; 4583 } 4584 } 4585 return TargetLowering::getConstraintType(Constraint); 4586 } 4587 4588 std::pair<unsigned, const TargetRegisterClass *> 4589 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 4590 StringRef Constraint, 4591 MVT VT) const { 4592 if (Constraint.size() == 1) { 4593 switch (Constraint[0]) { 4594 case 'b': 4595 return std::make_pair(0U, &NVPTX::Int1RegsRegClass); 4596 case 'c': 4597 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 4598 case 'h': 4599 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 4600 case 'r': 4601 return std::make_pair(0U, &NVPTX::Int32RegsRegClass); 4602 case 'l': 4603 case 'N': 4604 return std::make_pair(0U, &NVPTX::Int64RegsRegClass); 4605 case 'f': 4606 return std::make_pair(0U, &NVPTX::Float32RegsRegClass); 4607 case 'd': 4608 return std::make_pair(0U, &NVPTX::Float64RegsRegClass); 4609 } 4610 } 4611 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 4612 } 4613 4614 //===----------------------------------------------------------------------===// 4615 // NVPTX DAG Combining 4616 //===----------------------------------------------------------------------===// 4617 4618 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, 4619 CodeGenOpt::Level OptLevel) const { 4620 // Always honor command-line argument 4621 if (FMAContractLevelOpt.getNumOccurrences() > 0) 4622 return FMAContractLevelOpt > 0; 
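  // No explicit -nvptx-fma-level on the command line: fall through to the
  // opt-level, TargetOptions, and unsafe-math checks below.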
4623 
4624   // Do not contract if we're not optimizing the code.
4625   if (OptLevel == 0)
4626     return false;
4627 
4628   // Honor TargetOptions flags that explicitly say fusion is okay.
4629   if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
4630     return true;
4631 
4632   return allowUnsafeFPMath(MF);
4633 }
4634 
4635 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
4636   // Honor TargetOptions flags that explicitly say unsafe math is okay.
4637   if (MF.getTarget().Options.UnsafeFPMath)
4638     return true;
4639 
4640   // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
4641   const Function &F = MF.getFunction();
4642   return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
4643 }
4644 
4645 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
4646 /// operands N0 and N1. This is a helper for PerformADDCombine that is
4647 /// called with the default operands, and if that fails, with commuted
4648 /// operands.
4649 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
4650                                  TargetLowering::DAGCombinerInfo &DCI,
4651                                  const NVPTXSubtarget &Subtarget,
4652                                  CodeGenOpt::Level OptLevel) {
4653   SelectionDAG &DAG = DCI.DAG;
4654   // Skip the non-integer, non-scalar case.
4655   EVT VT = N0.getValueType();
4656   if (VT.isVector())
4657     return SDValue();
4658 
4659   // fold (add (mul a, b), c) -> (mad a, b, c)
4660   //
4661   if (N0.getOpcode() == ISD::MUL) {
4662     assert(VT.isInteger());
4663     // For integer:
4664     // Since integer multiply-add costs the same as integer multiply
4665     // but is more costly than integer add, do the fusion only when
4666     // the mul is only used in the add.
4667     if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
4668         !N0.getNode()->hasOneUse())
4669       return SDValue();
4670 
4671     // Do the folding.
4672     return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
4673                        N0.getOperand(0), N0.getOperand(1), N1);
4674   }
4675   else if (N0.getOpcode() == ISD::FMUL) {
4676     if (VT == MVT::f32 || VT == MVT::f64) {
4677       const auto *TLI = static_cast<const NVPTXTargetLowering *>(
4678           &DAG.getTargetLoweringInfo());
4679       if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
4680         return SDValue();
4681 
4682       // For floating point:
4683       // Do the fusion only when the mul has fewer than 5 uses and all
4684       // of them are adds.
4685       // The heuristic is that if a use is not an add, then that use
4686       // cannot be fused into an fma, so the mul is still needed anyway.
4687       // If there are more than 4 uses, even if they are all adds, fusing
4688       // them will increase register pressure.
4689       //
4690       int numUses = 0;
4691       int nonAddCount = 0;
4692       for (const SDNode *User : N0.getNode()->uses()) {
4693         numUses++;
4694         if (User->getOpcode() != ISD::FADD)
4695           ++nonAddCount;
4696       }
4697       if (numUses >= 5)
4698         return SDValue();
4699       if (nonAddCount) {
4700         int orderNo = N->getIROrder();
4701         int orderNo2 = N0.getNode()->getIROrder();
4702         // Simple heuristic for estimating potential register pressure:
4703         // the difference in IR order is used to measure the distance
4704         // between the def and this use; the longer the distance, the more
4705         // likely it is to cause register pressure.
4706         if (orderNo - orderNo2 < 500)
4707           return SDValue();
4708 
4709         // Now, check if at least one of the FMUL's operands is live beyond the node N,
4710         // which guarantees that the FMA will not increase register pressure at node N.
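        // "Live beyond N" is approximated via IR order: an operand counts as
        // live if it is a constant, or if any of its other users has a larger
        // IR order than N.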
4711 bool opIsLive = false; 4712 const SDNode *left = N0.getOperand(0).getNode(); 4713 const SDNode *right = N0.getOperand(1).getNode(); 4714 4715 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right)) 4716 opIsLive = true; 4717 4718 if (!opIsLive) 4719 for (const SDNode *User : left->uses()) { 4720 int orderNo3 = User->getIROrder(); 4721 if (orderNo3 > orderNo) { 4722 opIsLive = true; 4723 break; 4724 } 4725 } 4726 4727 if (!opIsLive) 4728 for (const SDNode *User : right->uses()) { 4729 int orderNo3 = User->getIROrder(); 4730 if (orderNo3 > orderNo) { 4731 opIsLive = true; 4732 break; 4733 } 4734 } 4735 4736 if (!opIsLive) 4737 return SDValue(); 4738 } 4739 4740 return DAG.getNode(ISD::FMA, SDLoc(N), VT, 4741 N0.getOperand(0), N0.getOperand(1), N1); 4742 } 4743 } 4744 4745 return SDValue(); 4746 } 4747 4748 static SDValue PerformStoreRetvalCombine(SDNode *N) { 4749 // Operands from the 2nd to the last one are the values to be stored 4750 for (std::size_t I = 2, OpsCount = N->ops().size(); I != OpsCount; ++I) 4751 if (!N->getOperand(I).isUndef()) 4752 return SDValue(); 4753 4754 // Operand 0 is the previous value in the chain. Cannot return EntryToken 4755 // as the previous value will become unused and eliminated later. 4756 return N->getOperand(0); 4757 } 4758 4759 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 4760 /// 4761 static SDValue PerformADDCombine(SDNode *N, 4762 TargetLowering::DAGCombinerInfo &DCI, 4763 const NVPTXSubtarget &Subtarget, 4764 CodeGenOpt::Level OptLevel) { 4765 SDValue N0 = N->getOperand(0); 4766 SDValue N1 = N->getOperand(1); 4767 4768 // First try with the default operand order. 4769 if (SDValue Result = 4770 PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel)) 4771 return Result; 4772 4773 // If that didn't work, try again with the operands commuted. 4774 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel); 4775 } 4776 4777 static SDValue PerformANDCombine(SDNode *N, 4778 TargetLowering::DAGCombinerInfo &DCI) { 4779 // The type legalizer turns a vector load of i8 values into a zextload to i16 4780 // registers, optionally ANY_EXTENDs it (if target type is integer), 4781 // and ANDs off the high 8 bits. Since we turn this load into a 4782 // target-specific DAG node, the DAG combiner fails to eliminate these AND 4783 // nodes. Do that here. 4784 SDValue Val = N->getOperand(0); 4785 SDValue Mask = N->getOperand(1); 4786 4787 if (isa<ConstantSDNode>(Val)) { 4788 std::swap(Val, Mask); 4789 } 4790 4791 SDValue AExt; 4792 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and 4793 if (Val.getOpcode() == ISD::ANY_EXTEND) { 4794 AExt = Val; 4795 Val = Val->getOperand(0); 4796 } 4797 4798 if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { 4799 Val = Val->getOperand(0); 4800 } 4801 4802 if (Val->getOpcode() == NVPTXISD::LoadV2 || 4803 Val->getOpcode() == NVPTXISD::LoadV4) { 4804 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); 4805 if (!MaskCnst) { 4806 // Not an AND with a constant 4807 return SDValue(); 4808 } 4809 4810 uint64_t MaskVal = MaskCnst->getZExtValue(); 4811 if (MaskVal != 0xff) { 4812 // Not an AND that chops off top 8 bits 4813 return SDValue(); 4814 } 4815 4816 MemSDNode *Mem = dyn_cast<MemSDNode>(Val); 4817 if (!Mem) { 4818 // Not a MemSDNode?!? 
4819 return SDValue(); 4820 } 4821 4822 EVT MemVT = Mem->getMemoryVT(); 4823 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { 4824 // We only handle the i8 case 4825 return SDValue(); 4826 } 4827 4828 unsigned ExtType = 4829 cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))-> 4830 getZExtValue(); 4831 if (ExtType == ISD::SEXTLOAD) { 4832 // If for some reason the load is a sextload, the and is needed to zero 4833 // out the high 8 bits 4834 return SDValue(); 4835 } 4836 4837 bool AddTo = false; 4838 if (AExt.getNode() != nullptr) { 4839 // Re-insert the ext as a zext. 4840 Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), 4841 AExt.getValueType(), Val); 4842 AddTo = true; 4843 } 4844 4845 // If we get here, the AND is unnecessary. Just replace it with the load 4846 DCI.CombineTo(N, Val, AddTo); 4847 } 4848 4849 return SDValue(); 4850 } 4851 4852 static SDValue PerformREMCombine(SDNode *N, 4853 TargetLowering::DAGCombinerInfo &DCI, 4854 CodeGenOpt::Level OptLevel) { 4855 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM); 4856 4857 // Don't do anything at less than -O2. 4858 if (OptLevel < CodeGenOpt::Default) 4859 return SDValue(); 4860 4861 SelectionDAG &DAG = DCI.DAG; 4862 SDLoc DL(N); 4863 EVT VT = N->getValueType(0); 4864 bool IsSigned = N->getOpcode() == ISD::SREM; 4865 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV; 4866 4867 const SDValue &Num = N->getOperand(0); 4868 const SDValue &Den = N->getOperand(1); 4869 4870 for (const SDNode *U : Num->uses()) { 4871 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num && 4872 U->getOperand(1) == Den) { 4873 // Num % Den -> Num - (Num / Den) * Den 4874 return DAG.getNode(ISD::SUB, DL, VT, Num, 4875 DAG.getNode(ISD::MUL, DL, VT, 4876 DAG.getNode(DivOpc, DL, VT, Num, Den), 4877 Den)); 4878 } 4879 } 4880 return SDValue(); 4881 } 4882 4883 enum OperandSignedness { 4884 Signed = 0, 4885 Unsigned, 4886 Unknown 4887 }; 4888 4889 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand 4890 /// that can be demoted to \p OptSize bits without loss of information. The 4891 /// signedness of the operand, if determinable, is placed in \p S. 4892 static bool IsMulWideOperandDemotable(SDValue Op, 4893 unsigned OptSize, 4894 OperandSignedness &S) { 4895 S = Unknown; 4896 4897 if (Op.getOpcode() == ISD::SIGN_EXTEND || 4898 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { 4899 EVT OrigVT = Op.getOperand(0).getValueType(); 4900 if (OrigVT.getFixedSizeInBits() <= OptSize) { 4901 S = Signed; 4902 return true; 4903 } 4904 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { 4905 EVT OrigVT = Op.getOperand(0).getValueType(); 4906 if (OrigVT.getFixedSizeInBits() <= OptSize) { 4907 S = Unsigned; 4908 return true; 4909 } 4910 } 4911 4912 return false; 4913 } 4914 4915 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can 4916 /// be demoted to \p OptSize bits without loss of information. If the operands 4917 /// contain a constant, it should appear as the RHS operand. The signedness of 4918 /// the operands is placed in \p IsSigned. 
4919 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, 4920 unsigned OptSize, 4921 bool &IsSigned) { 4922 OperandSignedness LHSSign; 4923 4924 // The LHS operand must be a demotable op 4925 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign)) 4926 return false; 4927 4928 // We should have been able to determine the signedness from the LHS 4929 if (LHSSign == Unknown) 4930 return false; 4931 4932 IsSigned = (LHSSign == Signed); 4933 4934 // The RHS can be a demotable op or a constant 4935 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { 4936 const APInt &Val = CI->getAPIntValue(); 4937 if (LHSSign == Unsigned) { 4938 return Val.isIntN(OptSize); 4939 } else { 4940 return Val.isSignedIntN(OptSize); 4941 } 4942 } else { 4943 OperandSignedness RHSSign; 4944 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) 4945 return false; 4946 4947 return LHSSign == RHSSign; 4948 } 4949 } 4950 4951 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply 4952 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform 4953 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift 4954 /// amount. 4955 static SDValue TryMULWIDECombine(SDNode *N, 4956 TargetLowering::DAGCombinerInfo &DCI) { 4957 EVT MulType = N->getValueType(0); 4958 if (MulType != MVT::i32 && MulType != MVT::i64) { 4959 return SDValue(); 4960 } 4961 4962 SDLoc DL(N); 4963 unsigned OptSize = MulType.getSizeInBits() >> 1; 4964 SDValue LHS = N->getOperand(0); 4965 SDValue RHS = N->getOperand(1); 4966 4967 // Canonicalize the multiply so the constant (if any) is on the right 4968 if (N->getOpcode() == ISD::MUL) { 4969 if (isa<ConstantSDNode>(LHS)) { 4970 std::swap(LHS, RHS); 4971 } 4972 } 4973 4974 // If we have a SHL, determine the actual multiply amount 4975 if (N->getOpcode() == ISD::SHL) { 4976 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS); 4977 if (!ShlRHS) { 4978 return SDValue(); 4979 } 4980 4981 APInt ShiftAmt = ShlRHS->getAPIntValue(); 4982 unsigned BitWidth = MulType.getSizeInBits(); 4983 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) { 4984 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; 4985 RHS = DCI.DAG.getConstant(MulVal, DL, MulType); 4986 } else { 4987 return SDValue(); 4988 } 4989 } 4990 4991 bool Signed; 4992 // Verify that our operands are demotable 4993 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) { 4994 return SDValue(); 4995 } 4996 4997 EVT DemotedVT; 4998 if (MulType == MVT::i32) { 4999 DemotedVT = MVT::i16; 5000 } else { 5001 DemotedVT = MVT::i32; 5002 } 5003 5004 // Truncate the operands to the correct size. Note that these are just for 5005 // type consistency and will (likely) be eliminated in later phases. 5006 SDValue TruncLHS = 5007 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS); 5008 SDValue TruncRHS = 5009 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS); 5010 5011 unsigned Opc; 5012 if (Signed) { 5013 Opc = NVPTXISD::MUL_WIDE_SIGNED; 5014 } else { 5015 Opc = NVPTXISD::MUL_WIDE_UNSIGNED; 5016 } 5017 5018 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS); 5019 } 5020 5021 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes. 
5022 static SDValue PerformMULCombine(SDNode *N, 5023 TargetLowering::DAGCombinerInfo &DCI, 5024 CodeGenOpt::Level OptLevel) { 5025 if (OptLevel > 0) { 5026 // Try mul.wide combining at OptLevel > 0 5027 if (SDValue Ret = TryMULWIDECombine(N, DCI)) 5028 return Ret; 5029 } 5030 5031 return SDValue(); 5032 } 5033 5034 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes. 5035 static SDValue PerformSHLCombine(SDNode *N, 5036 TargetLowering::DAGCombinerInfo &DCI, 5037 CodeGenOpt::Level OptLevel) { 5038 if (OptLevel > 0) { 5039 // Try mul.wide combining at OptLevel > 0 5040 if (SDValue Ret = TryMULWIDECombine(N, DCI)) 5041 return Ret; 5042 } 5043 5044 return SDValue(); 5045 } 5046 5047 static SDValue PerformSETCCCombine(SDNode *N, 5048 TargetLowering::DAGCombinerInfo &DCI) { 5049 EVT CCType = N->getValueType(0); 5050 SDValue A = N->getOperand(0); 5051 SDValue B = N->getOperand(1); 5052 5053 if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16) 5054 return SDValue(); 5055 5056 SDLoc DL(N); 5057 // setp.f16x2 returns two scalar predicates, which we need to 5058 // convert back to v2i1. The returned result will be scalarized by 5059 // the legalizer, but the comparison will remain a single vector 5060 // instruction. 5061 SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL, 5062 DCI.DAG.getVTList(MVT::i1, MVT::i1), 5063 {A, B, N->getOperand(2)}); 5064 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0), 5065 CCNode.getValue(1)); 5066 } 5067 5068 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, 5069 DAGCombinerInfo &DCI) const { 5070 CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel(); 5071 switch (N->getOpcode()) { 5072 default: break; 5073 case ISD::ADD: 5074 case ISD::FADD: 5075 return PerformADDCombine(N, DCI, STI, OptLevel); 5076 case ISD::MUL: 5077 return PerformMULCombine(N, DCI, OptLevel); 5078 case ISD::SHL: 5079 return PerformSHLCombine(N, DCI, OptLevel); 5080 case ISD::AND: 5081 return PerformANDCombine(N, DCI); 5082 case ISD::UREM: 5083 case ISD::SREM: 5084 return PerformREMCombine(N, DCI, OptLevel); 5085 case ISD::SETCC: 5086 return PerformSETCCCombine(N, DCI); 5087 case NVPTXISD::StoreRetval: 5088 case NVPTXISD::StoreRetvalV2: 5089 case NVPTXISD::StoreRetvalV4: 5090 return PerformStoreRetvalCombine(N); 5091 } 5092 return SDValue(); 5093 } 5094 5095 /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. 5096 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, 5097 SmallVectorImpl<SDValue> &Results) { 5098 EVT ResVT = N->getValueType(0); 5099 SDLoc DL(N); 5100 5101 assert(ResVT.isVector() && "Vector load must have vector type"); 5102 5103 // We only handle "native" vector sizes for now, e.g. <4 x double> is not 5104 // legal. We can (and should) split that into 2 loads of <2 x double> here 5105 // but I'm leaving that as a TODO for now. 
5106 assert(ResVT.isSimple() && "Can only handle simple types"); 5107 switch (ResVT.getSimpleVT().SimpleTy) { 5108 default: 5109 return; 5110 case MVT::v2i8: 5111 case MVT::v2i16: 5112 case MVT::v2i32: 5113 case MVT::v2i64: 5114 case MVT::v2f16: 5115 case MVT::v2f32: 5116 case MVT::v2f64: 5117 case MVT::v4i8: 5118 case MVT::v4i16: 5119 case MVT::v4i32: 5120 case MVT::v4f16: 5121 case MVT::v4f32: 5122 case MVT::v8f16: // <4 x f16x2> 5123 // This is a "native" vector type 5124 break; 5125 } 5126 5127 LoadSDNode *LD = cast<LoadSDNode>(N); 5128 5129 Align Alignment = LD->getAlign(); 5130 auto &TD = DAG.getDataLayout(); 5131 Align PrefAlign = TD.getPrefTypeAlign(ResVT.getTypeForEVT(*DAG.getContext())); 5132 if (Alignment < PrefAlign) { 5133 // This load is not sufficiently aligned, so bail out and let this vector 5134 // load be scalarized. Note that we may still be able to emit smaller 5135 // vector loads. For example, if we are loading a <4 x float> with an 5136 // alignment of 8, this check will fail but the legalizer will try again 5137 // with 2 x <2 x float>, which will succeed with an alignment of 8. 5138 return; 5139 } 5140 5141 EVT EltVT = ResVT.getVectorElementType(); 5142 unsigned NumElts = ResVT.getVectorNumElements(); 5143 5144 // Since LoadV2 is a target node, we cannot rely on DAG type legalization. 5145 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 5146 // loaded type to i16 and propagate the "real" type as the memory type. 5147 bool NeedTrunc = false; 5148 if (EltVT.getSizeInBits() < 16) { 5149 EltVT = MVT::i16; 5150 NeedTrunc = true; 5151 } 5152 5153 unsigned Opcode = 0; 5154 SDVTList LdResVTs; 5155 bool LoadF16x2 = false; 5156 5157 switch (NumElts) { 5158 default: 5159 return; 5160 case 2: 5161 Opcode = NVPTXISD::LoadV2; 5162 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); 5163 break; 5164 case 4: { 5165 Opcode = NVPTXISD::LoadV4; 5166 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; 5167 LdResVTs = DAG.getVTList(ListVTs); 5168 break; 5169 } 5170 case 8: { 5171 // v8f16 is a special case. PTX doesn't have ld.v8.f16 5172 // instruction. Instead, we split the vector into v2f16 chunks and 5173 // load them with ld.v4.b32. 5174 assert((EltVT == MVT::f16 || EltVT == MVT::bf16) && 5175 "Unsupported v8 vector type."); 5176 LoadF16x2 = true; 5177 Opcode = NVPTXISD::LoadV4; 5178 EVT VVT = (EltVT == MVT::f16) ? MVT::v2f16 : MVT::v2bf16; 5179 EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other}; 5180 LdResVTs = DAG.getVTList(ListVTs); 5181 break; 5182 } 5183 } 5184 5185 // Copy regular operands 5186 SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end()); 5187 5188 // The select routine does not have access to the LoadSDNode instance, so 5189 // pass along the extension information 5190 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL)); 5191 5192 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, 5193 LD->getMemoryVT(), 5194 LD->getMemOperand()); 5195 5196 SmallVector<SDValue, 8> ScalarRes; 5197 if (LoadF16x2) { 5198 // Split v2f16 subvectors back into individual elements. 
5199 NumElts /= 2; 5200 for (unsigned i = 0; i < NumElts; ++i) { 5201 SDValue SubVector = NewLD.getValue(i); 5202 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector, 5203 DAG.getIntPtrConstant(0, DL)); 5204 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector, 5205 DAG.getIntPtrConstant(1, DL)); 5206 ScalarRes.push_back(E0); 5207 ScalarRes.push_back(E1); 5208 } 5209 } else { 5210 for (unsigned i = 0; i < NumElts; ++i) { 5211 SDValue Res = NewLD.getValue(i); 5212 if (NeedTrunc) 5213 Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); 5214 ScalarRes.push_back(Res); 5215 } 5216 } 5217 5218 SDValue LoadChain = NewLD.getValue(NumElts); 5219 5220 SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes); 5221 5222 Results.push_back(BuildVec); 5223 Results.push_back(LoadChain); 5224 } 5225 5226 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, 5227 SmallVectorImpl<SDValue> &Results) { 5228 SDValue Chain = N->getOperand(0); 5229 SDValue Intrin = N->getOperand(1); 5230 SDLoc DL(N); 5231 5232 // Get the intrinsic ID 5233 unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue(); 5234 switch (IntrinNo) { 5235 default: 5236 return; 5237 case Intrinsic::nvvm_ldg_global_i: 5238 case Intrinsic::nvvm_ldg_global_f: 5239 case Intrinsic::nvvm_ldg_global_p: 5240 case Intrinsic::nvvm_ldu_global_i: 5241 case Intrinsic::nvvm_ldu_global_f: 5242 case Intrinsic::nvvm_ldu_global_p: { 5243 EVT ResVT = N->getValueType(0); 5244 5245 if (ResVT.isVector()) { 5246 // Vector LDG/LDU 5247 5248 unsigned NumElts = ResVT.getVectorNumElements(); 5249 EVT EltVT = ResVT.getVectorElementType(); 5250 5251 // Since LDU/LDG are target nodes, we cannot rely on DAG type 5252 // legalization. 5253 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 5254 // loaded type to i16 and propagate the "real" type as the memory type. 
5255 bool NeedTrunc = false; 5256 if (EltVT.getSizeInBits() < 16) { 5257 EltVT = MVT::i16; 5258 NeedTrunc = true; 5259 } 5260 5261 unsigned Opcode = 0; 5262 SDVTList LdResVTs; 5263 5264 switch (NumElts) { 5265 default: 5266 return; 5267 case 2: 5268 switch (IntrinNo) { 5269 default: 5270 return; 5271 case Intrinsic::nvvm_ldg_global_i: 5272 case Intrinsic::nvvm_ldg_global_f: 5273 case Intrinsic::nvvm_ldg_global_p: 5274 Opcode = NVPTXISD::LDGV2; 5275 break; 5276 case Intrinsic::nvvm_ldu_global_i: 5277 case Intrinsic::nvvm_ldu_global_f: 5278 case Intrinsic::nvvm_ldu_global_p: 5279 Opcode = NVPTXISD::LDUV2; 5280 break; 5281 } 5282 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); 5283 break; 5284 case 4: { 5285 switch (IntrinNo) { 5286 default: 5287 return; 5288 case Intrinsic::nvvm_ldg_global_i: 5289 case Intrinsic::nvvm_ldg_global_f: 5290 case Intrinsic::nvvm_ldg_global_p: 5291 Opcode = NVPTXISD::LDGV4; 5292 break; 5293 case Intrinsic::nvvm_ldu_global_i: 5294 case Intrinsic::nvvm_ldu_global_f: 5295 case Intrinsic::nvvm_ldu_global_p: 5296 Opcode = NVPTXISD::LDUV4; 5297 break; 5298 } 5299 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; 5300 LdResVTs = DAG.getVTList(ListVTs); 5301 break; 5302 } 5303 } 5304 5305 SmallVector<SDValue, 8> OtherOps; 5306 5307 // Copy regular operands 5308 5309 OtherOps.push_back(Chain); // Chain 5310 // Skip operand 1 (intrinsic ID) 5311 // Others 5312 OtherOps.append(N->op_begin() + 2, N->op_end()); 5313 5314 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N); 5315 5316 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, 5317 MemSD->getMemoryVT(), 5318 MemSD->getMemOperand()); 5319 5320 SmallVector<SDValue, 4> ScalarRes; 5321 5322 for (unsigned i = 0; i < NumElts; ++i) { 5323 SDValue Res = NewLD.getValue(i); 5324 if (NeedTrunc) 5325 Res = 5326 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); 5327 ScalarRes.push_back(Res); 5328 } 5329 5330 SDValue LoadChain = NewLD.getValue(NumElts); 5331 5332 SDValue BuildVec = 5333 DAG.getBuildVector(ResVT, DL, ScalarRes); 5334 5335 Results.push_back(BuildVec); 5336 Results.push_back(LoadChain); 5337 } else { 5338 // i8 LDG/LDU 5339 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 && 5340 "Custom handling of non-i8 ldu/ldg?"); 5341 5342 // Just copy all operands as-is 5343 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end()); 5344 5345 // Force output to i16 5346 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other); 5347 5348 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N); 5349 5350 // We make sure the memory type is i8, which will be used during isel 5351 // to select the proper instruction. 
5352 SDValue NewLD = 5353 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops, 5354 MVT::i8, MemSD->getMemOperand()); 5355 5356 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, 5357 NewLD.getValue(0))); 5358 Results.push_back(NewLD.getValue(1)); 5359 } 5360 } 5361 } 5362 } 5363 5364 void NVPTXTargetLowering::ReplaceNodeResults( 5365 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { 5366 switch (N->getOpcode()) { 5367 default: 5368 report_fatal_error("Unhandled custom legalization"); 5369 case ISD::LOAD: 5370 ReplaceLoadVector(N, DAG, Results); 5371 return; 5372 case ISD::INTRINSIC_W_CHAIN: 5373 ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); 5374 return; 5375 } 5376 } 5377 5378 NVPTXTargetLowering::AtomicExpansionKind 5379 NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { 5380 Type *Ty = AI->getValOperand()->getType(); 5381 5382 if (AI->isFloatingPointOperation()) { 5383 if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) { 5384 if (Ty->isFloatTy()) 5385 return AtomicExpansionKind::None; 5386 if (Ty->isDoubleTy() && STI.hasAtomAddF64()) 5387 return AtomicExpansionKind::None; 5388 } 5389 return AtomicExpansionKind::CmpXChg; 5390 } 5391 5392 assert(Ty->isIntegerTy() && "Ty should be integer at this point"); 5393 auto ITy = cast<llvm::IntegerType>(Ty); 5394 5395 switch (AI->getOperation()) { 5396 default: 5397 return AtomicExpansionKind::CmpXChg; 5398 case AtomicRMWInst::BinOp::And: 5399 case AtomicRMWInst::BinOp::Or: 5400 case AtomicRMWInst::BinOp::Xor: 5401 case AtomicRMWInst::BinOp::Xchg: 5402 switch (ITy->getBitWidth()) { 5403 case 8: 5404 case 16: 5405 return AtomicExpansionKind::CmpXChg; 5406 case 32: 5407 return AtomicExpansionKind::None; 5408 case 64: 5409 if (STI.hasAtomBitwise64()) 5410 return AtomicExpansionKind::None; 5411 return AtomicExpansionKind::CmpXChg; 5412 default: 5413 llvm_unreachable("unsupported width encountered"); 5414 } 5415 case AtomicRMWInst::BinOp::Add: 5416 case AtomicRMWInst::BinOp::Sub: 5417 case AtomicRMWInst::BinOp::Max: 5418 case AtomicRMWInst::BinOp::Min: 5419 case AtomicRMWInst::BinOp::UMax: 5420 case AtomicRMWInst::BinOp::UMin: 5421 switch (ITy->getBitWidth()) { 5422 case 8: 5423 case 16: 5424 return AtomicExpansionKind::CmpXChg; 5425 case 32: 5426 return AtomicExpansionKind::None; 5427 case 64: 5428 if (STI.hasAtomMinMax64()) 5429 return AtomicExpansionKind::None; 5430 return AtomicExpansionKind::CmpXChg; 5431 default: 5432 llvm_unreachable("unsupported width encountered"); 5433 } 5434 } 5435 5436 return AtomicExpansionKind::CmpXChg; 5437 } 5438 5439 // Pin NVPTXTargetObjectFile's vtables to this file. 5440 NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default; 5441 5442 MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal( 5443 const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { 5444 return getDataSection(); 5445 } 5446